import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
# Source CSV uses ';' as the field separator and '.' as the decimal mark
full_df = pd.read_csv('./airbnb-listings-extract.csv', sep=';', decimal='.')
# Filter by country (Spain) and take an independent copy to avoid chained-assignment issues
data_spain = full_df[full_df['Country Code'] == 'ES'].copy()
Para realizar un filtrado efectivo que incluya todas las variantes relevantes de Madrid y sus alrededores (queremos incluir no solo "Madrid" sino también otras localidades cercanas como "Pozuelo de Alarcón", "Boadilla del Monte", "Aravaca", entre otras), necesitaremos una función de filtrado detallada:
def is_madrid(location):
    """Return True when *location* names Madrid or a nearby locality.

    Matching is a case-insensitive substring search against a fixed list
    of area keywords; non-string values (e.g. NaN) never match.
    """
    madrid_areas = ['madrid', 'pozuelo', 'boadilla del monte', 'aravaca', 'las matas', 'san fernando de henares', 'ventas', 'chueca', 'vallecas', 'villa verde', 'delicias']
    if not isinstance(location, str):
        return False
    normalized = location.lower()
    return any(area in normalized for area in madrid_areas)
# Keep only the rows whose 'Smart Location' matches one of the Madrid-area keywords
data_madrid = data_spain[data_spain['Smart Location'].apply(is_madrid)]
# Inspect the raw zip codes: the dump below shows typos, duplicates and junk values
print(f"Valores únicos en 'Zipcode': {data_madrid['Zipcode'].unique()}")
Valores únicos en 'Zipcode': ['28007' '28001' '28006' '28009' '28028' '28002' '28016' nan '28046' '28039' '28020' '28025' '28041' '28026' '28019' '28018' '28053' '28038' '28030' '28010' '28003' '28035' '28008' '28013' '28034' '28040' '28023' '28011' '28024' '28044' '28021' '28031' '28032' '28037' '28042' '28224' '28017' '28027' '28043' '28033' '28050' '28045' '28012' '28005' '28014' '28036' '28004' '28029' '28015' '28105' '28051' '28022' '28055' '28054' '28047' '280013' '28094' '28002\n28002' '28850' '28660' '25008' '2015' '27004' '28060' '28056' '28290' '28049' '2805' '28052' '20013' '28048' '2815' '2802\n28012' 'Madrid 28004' '28830' '2804' '-' '28' '27013' '28058' '28051\n28051' '20126']
def clean_zipcode(zipcode):
    """Normalize a raw Zipcode value to a 5-digit Madrid code ('28xxx').

    Returns the cleaned 5-character string, or None for NaN, non-string
    input, or anything that does not reduce to a valid '28xxx' code.
    """
    if pd.isna(zipcode) or not isinstance(zipcode, str):
        return None
    # Some cells repeat the code on two lines (e.g. '28002\n28002').
    # Keep only the first line BEFORE stripping non-digits; the original
    # code split after joining the digits, so the two copies fused into a
    # 10-digit string and valid codes were wrongly discarded.
    zipcode = zipcode.split('\n')[0]
    zipcode = zipcode.replace('Madrid', '').strip()
    zipcode = ''.join(filter(str.isdigit, zipcode))
    return zipcode if zipcode.startswith('28') and len(zipcode) == 5 else None
data_madrid.loc[:, 'Zipcode'] = data_madrid['Zipcode'].apply(clean_zipcode)
from sklearn.model_selection import train_test_split
# Split into train/test sets (80/20), shuffled with a fixed seed for reproducibility
train, test = train_test_split(data_madrid, test_size=0.2, shuffle=True, random_state=0)
print(f'Dimensiones del dataset de training: {train.shape}')
print(f'Dimensiones del dataset de test: {test.shape}')
# Persist both splits with the same separator/decimal convention as the source CSV
train.to_csv('./train.csv', sep=';', decimal='.', index=False)
test.to_csv('./test.csv', sep=';', decimal='.', index=False)
# Reload the training split and work ONLY with it from here on (avoids test-set leakage)
data_train = pd.read_csv('./train.csv', sep=';', decimal='.')
Dimensiones del dataset de training: (10609, 89) Dimensiones del dataset de test: (2653, 89)
Pasos que vamos a seguir: inspección inicial del dataset con .head(), .info() y .describe().
pd.set_option('display.max_rows', None) # Configurar Pandas para mostrar todas las filas
pd.set_option('display.max_columns', None) # Configure Pandas to show every column
# Transpose head() so the 89 columns read as rows (easier to scan)
data_train.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| ID | 13448215 | 1046926 | 11633315 | 16801780 | 8667962 |
| Listing Url | https://www.airbnb.com/rooms/13448215 | https://www.airbnb.com/rooms/1046926 | https://www.airbnb.com/rooms/11633315 | https://www.airbnb.com/rooms/16801780 | https://www.airbnb.com/rooms/8667962 |
| Scrape ID | 20170407214119 | 20170407214119 | 20170407214119 | 20170407214119 | 20170407214119 |
| Last Scraped | 2017-04-08 | 2017-04-08 | 2017-04-08 | 2017-04-08 | 2017-04-08 |
| Name | Luxury Room Madrid Retiro Wifi | Single room in Madrid - WIFI | Apartment a step away from Gran Vía | Habitación doble en Lavapies | Comfortable Small Room & Private Bathroom- Cen... |
| Summary | This is a new 100sqm 2 bedroom apartment locat... | Single room in cozy apartment with 4 bedrooms ... | Wonderful apartment in the centre of Madrid, n... | Habitación doble con vistas al jardín del conv... | Confortable room for 1 but can allow up to 2 p... |
| Space | It's a master room with a 160cm real bed with ... | One single room for rent in a spacious apartme... | It´s on first floor. A very cozy and quiet apa... | El piso consta de dos habitaciones, un salón, ... | El apartamento es muy acogedor, luminoso, sile... |
| Description | This is a new 100sqm 2 bedroom apartment locat... | Single room in cozy apartment with 4 bedrooms ... | Wonderful apartment in the centre of Madrid, n... | Habitación doble con vistas al jardín del conv... | Confortable room for 1 but can allow up to 2 p... |
| Experiences Offered | none | none | none | none | none |
| Neighborhood Overview | The neighborhood is located in the city center... | La zona es tranquila y muy apropiada para los ... | Palacio neighborhood. A marvellous touristic a... | El barrio de Lavapies es único, por su ambient... | La zona es genial. Además de estar cerca de to... |
| Notes | I love non-smoking, respectful guests. No part... | NaN | NaN | Precio habitación doble (1 huésped): 30€ Preci... | El Carrefour Express que está al lado está muy... |
| Transit | It's very convenient to move around by bus or ... | El piso se encuentra a 10 min andando de la es... | Plaza de España metro station at only a few se... | Dispones a 2 minutos del metro de Lavapies, y ... | Tienes a dos minutos andando las líneas 2, 3 y... |
| Access | AC, heating, hot water. Fully equipped kitchen... | La cocina y el comedor, así como todos sus ute... | We speak perfect English, and we would be deli... | Durante tu alojamiento puedes disponer de todo... | Del apartamento puedes usar: El comedor para d... |
| Interaction | Landlord has quite a busy working schedule but... | Me gusta tener una relacion cercana con los in... | We speak perfect English, and we would be deli... | Por supuesto es que estaré disponible por si m... | Nosotros te recibiremos personalmente en nuest... |
| House Rules | - No smoking, no parties, respectful with neig... | Ninguna norma que impida sentirse cómodo en la... | Our main rule is that you make yourself feel a... | Vivo en casa con Fiona, una gata Maine Coon mu... | Somos una pareja tranquila que valora muchísim... |
| Thumbnail Url | NaN | https://a0.muscache.com/im/pictures/866cfc5e-f... | https://a0.muscache.com/im/pictures/f13148ba-c... | https://a0.muscache.com/im/pictures/c8acf003-1... | https://a0.muscache.com/im/pictures/ccc118e5-8... |
| Medium Url | NaN | https://a0.muscache.com/im/pictures/866cfc5e-f... | https://a0.muscache.com/im/pictures/f13148ba-c... | https://a0.muscache.com/im/pictures/c8acf003-1... | https://a0.muscache.com/im/pictures/ccc118e5-8... |
| Picture Url | https://public.opendatasoft.com/api/v2/catalog... | https://public.opendatasoft.com/api/v2/catalog... | https://public.opendatasoft.com/api/v2/catalog... | https://public.opendatasoft.com/api/v2/catalog... | https://public.opendatasoft.com/api/v2/catalog... |
| XL Picture Url | NaN | https://a0.muscache.com/im/pictures/866cfc5e-f... | https://a0.muscache.com/im/pictures/f13148ba-c... | https://a0.muscache.com/im/pictures/c8acf003-1... | https://a0.muscache.com/im/pictures/ccc118e5-8... |
| Host ID | 76682878 | 2913511 | 61647308 | 5842906 | 26551999 |
| Host URL | https://www.airbnb.com/users/show/76682878 | https://www.airbnb.com/users/show/2913511 | https://www.airbnb.com/users/show/61647308 | https://www.airbnb.com/users/show/5842906 | https://www.airbnb.com/users/show/26551999 |
| Host Name | Hai Wie | Amparo | Carmen Y Ricky | Cristina | Rocio Y Manuel |
| Host Since | 2016-06-09 | 2012-07-12 | 2016-03-05 | 2013-04-09 | 2015-01-22 |
| Host Location | Madrid, Community of Madrid, Spain | Madrid, Madrid, Spain | Madrid, Community of Madrid, Spain | Madrid, Community of Madrid, Spain | Madrid, Community of Madrid, Spain |
| Host About | I'm a Chinese-Spanish real estate consultant l... | Soy quien soy por las personas que me rodean y... | NaN | Arqueóloga, investigador postdoctoral y profes... | Somos una pareja de recién casados, muy alegre... |
| Host Response Time | within a day | within a few hours | within an hour | within an hour | within a few hours |
| Host Response Rate | 70.0 | 100.0 | 100.0 | 100.0 | 100.0 |
| Host Acceptance Rate | NaN | NaN | NaN | NaN | NaN |
| Host Thumbnail Url | https://a0.muscache.com/im/pictures/e8de5ad0-6... | https://a0.muscache.com/im/users/2913511/profi... | https://a0.muscache.com/im/pictures/663750d7-f... | https://a0.muscache.com/im/pictures/f0aad32a-2... | https://a0.muscache.com/im/pictures/95c78d4b-5... |
| Host Picture Url | https://a0.muscache.com/im/pictures/e8de5ad0-6... | https://a0.muscache.com/im/users/2913511/profi... | https://a0.muscache.com/im/pictures/663750d7-f... | https://a0.muscache.com/im/pictures/f0aad32a-2... | https://a0.muscache.com/im/pictures/95c78d4b-5... |
| Host Neighbourhood | NaN | Usera | Palacio | Embajadores | Malasaña |
| Host Listings Count | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 |
| Host Total Listings Count | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 |
| Host Verifications | email,phone,reviews | email,phone,reviews,jumio | email,phone,reviews,jumio,government_id | email,phone,facebook | email,phone,reviews,jumio |
| Street | Madrid, Comunidad de Madrid 28007, Spain | Usera, Madrid, Community of Madrid 28041, Spain | Palacio, Madrid, Comunidad de Madrid 28013, Spain | Embajadores, Madrid, Comunidad de Madrid 28012... | Malasaña, Madrid, Comunidad de Madrid 28015, S... |
| Neighbourhood | NaN | Usera | Palacio | Embajadores | Malasaña |
| Neighbourhood Cleansed | Adelfas | San Fermín | Palacio | Embajadores | Universidad |
| Neighbourhood Group Cleansed | Retiro | Usera | Centro | Centro | Centro |
| City | Madrid | Madrid | Madrid | Madrid | Madrid |
| State | Comunidad de Madrid | Community of Madrid | Comunidad de Madrid | Comunidad de Madrid | Comunidad de Madrid |
| Zipcode | 28007.0 | 28041.0 | 28013.0 | 28012.0 | 28015.0 |
| Market | Madrid | Madrid | Madrid | Madrid | Madrid |
| Smart Location | Madrid, Spain | Madrid, Spain | Madrid, Spain | Madrid, Spain | Madrid, Spain |
| Country Code | ES | ES | ES | ES | ES |
| Country | Spain | Spain | Spain | Spain | Spain |
| Latitude | 40.404759 | 40.366693 | 40.422478 | 40.407975 | 40.426449 |
| Longitude | -3.669628 | -3.687317 | -3.710156 | -3.698652 | -3.712392 |
| Property Type | Apartment | Apartment | Apartment | House | Apartment |
| Room Type | Private room | Private room | Entire home/apt | Private room | Private room |
| Accommodates | 2 | 1 | 4 | 2 | 2 |
| Bathrooms | 2.0 | 2.0 | 1.0 | 1.0 | 2.0 |
| Bedrooms | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| Beds | 1.0 | 1.0 | 2.0 | 1.0 | 2.0 |
| Bed Type | Real Bed | Real Bed | Real Bed | Real Bed | Real Bed |
| Amenities | TV,Wireless Internet,Air conditioning,Pool,Kit... | TV,Wireless Internet,Wheelchair accessible,Kit... | TV,Wireless Internet,Air conditioning,Kitchen,... | TV,Internet,Wireless Internet,Kitchen,Pets all... | Internet,Wireless Internet,Air conditioning,Wh... |
| Square Feet | NaN | NaN | NaN | NaN | NaN |
| Price | 42.0 | 30.0 | 63.0 | 35.0 | 34.0 |
| Weekly Price | NaN | 170.0 | NaN | NaN | NaN |
| Monthly Price | NaN | 500.0 | NaN | NaN | NaN |
| Security Deposit | 300.0 | NaN | 100.0 | NaN | NaN |
| Cleaning Fee | 12.0 | NaN | NaN | NaN | NaN |
| Guests Included | 1 | 1 | 2 | 1 | 1 |
| Extra People | 0 | 30 | 10 | 10 | 5 |
| Minimum Nights | 2 | 2 | 2 | 1 | 2 |
| Maximum Nights | 1125 | 1125 | 1125 | 3 | 10 |
| Calendar Updated | 4 days ago | 2 weeks ago | 3 months ago | 4 days ago | today |
| Has Availability | NaN | NaN | NaN | NaN | NaN |
| Availability 30 | 11 | 29 | 3 | 20 | 17 |
| Availability 60 | 41 | 51 | 9 | 50 | 33 |
| Availability 90 | 71 | 77 | 18 | 74 | 58 |
| Availability 365 | 71 | 350 | 293 | 74 | 58 |
| Calendar last Scraped | 2017-04-08 | 2017-04-08 | 2017-04-07 | 2017-04-08 | 2017-04-08 |
| Number of Reviews | 4 | 14 | 81 | 4 | 62 |
| First Review | 2016-06-13 | 2014-05-12 | 2016-03-14 | 2017-02-12 | 2015-11-17 |
| Last Review | 2016-10-31 | 2017-02-12 | 2017-03-30 | 2017-04-02 | 2017-04-05 |
| Review Scores Rating | 93.0 | 94.0 | 95.0 | 100.0 | 96.0 |
| Review Scores Accuracy | 9.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| Review Scores Cleanliness | 8.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| Review Scores Checkin | 10.0 | 9.0 | 10.0 | 10.0 | 10.0 |
| Review Scores Communication | 10.0 | 10.0 | 10.0 | 10.0 | 10.0 |
| Review Scores Location | 9.0 | 9.0 | 10.0 | 10.0 | 10.0 |
| Review Scores Value | 9.0 | 9.0 | 10.0 | 10.0 | 10.0 |
| License | NaN | NaN | NaN | NaN | NaN |
| Jurisdiction Names | NaN | NaN | NaN | NaN | NaN |
| Cancellation Policy | moderate | moderate | moderate | flexible | moderate |
| Calculated host listings count | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 |
| Reviews per Month | 0.4 | 0.4 | 6.21 | 2.14 | 3.65 |
| Geolocation | 40.4047594303,-3.66962841278 | 40.366693238,-3.68731740216 | 40.42247797,-3.71015640446 | 40.407974965,-3.69865192241 | 40.4264486151,-3.7123924029 |
| Features | Host Has Profile Pic,Requires License | Host Has Profile Pic,Host Identity Verified,Is... | Host Has Profile Pic,Host Identity Verified,Is... | Host Has Profile Pic,Is Location Exact,Require... | Host Has Profile Pic,Host Identity Verified,Is... |
data_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10609 entries, 0 to 10608 Data columns (total 89 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 10609 non-null int64 1 Listing Url 10609 non-null object 2 Scrape ID 10609 non-null int64 3 Last Scraped 10609 non-null object 4 Name 10609 non-null object 5 Summary 10223 non-null object 6 Space 7729 non-null object 7 Description 10605 non-null object 8 Experiences Offered 10609 non-null object 9 Neighborhood Overview 6645 non-null object 10 Notes 4036 non-null object 11 Transit 6617 non-null object 12 Access 6057 non-null object 13 Interaction 6055 non-null object 14 House Rules 6915 non-null object 15 Thumbnail Url 8624 non-null object 16 Medium Url 8624 non-null object 17 Picture Url 10595 non-null object 18 XL Picture Url 8624 non-null object 19 Host ID 10609 non-null int64 20 Host URL 10609 non-null object 21 Host Name 10607 non-null object 22 Host Since 10607 non-null object 23 Host Location 10580 non-null object 24 Host About 6623 non-null object 25 Host Response Time 9292 non-null object 26 Host Response Rate 9292 non-null float64 27 Host Acceptance Rate 0 non-null float64 28 Host Thumbnail Url 10607 non-null object 29 Host Picture Url 10607 non-null object 30 Host Neighbourhood 7986 non-null object 31 Host Listings Count 10607 non-null float64 32 Host Total Listings Count 10607 non-null float64 33 Host Verifications 10604 non-null object 34 Street 10609 non-null object 35 Neighbourhood 7029 non-null object 36 Neighbourhood Cleansed 10609 non-null object 37 Neighbourhood Group Cleansed 10609 non-null object 38 City 10608 non-null object 39 State 10572 non-null object 40 Zipcode 10252 non-null float64 41 Market 10568 non-null object 42 Smart Location 10609 non-null object 43 Country Code 10609 non-null object 44 Country 10609 non-null object 45 Latitude 10609 non-null float64 46 Longitude 10609 non-null float64 47 Property Type 10609 non-null object 48 Room Type 10609 non-null object 
49 Accommodates 10609 non-null int64 50 Bathrooms 10571 non-null float64 51 Bedrooms 10595 non-null float64 52 Beds 10576 non-null float64 53 Bed Type 10609 non-null object 54 Amenities 10528 non-null object 55 Square Feet 408 non-null float64 56 Price 10601 non-null float64 57 Weekly Price 2707 non-null float64 58 Monthly Price 2660 non-null float64 59 Security Deposit 4538 non-null float64 60 Cleaning Fee 6262 non-null float64 61 Guests Included 10609 non-null int64 62 Extra People 10609 non-null int64 63 Minimum Nights 10609 non-null int64 64 Maximum Nights 10609 non-null int64 65 Calendar Updated 10609 non-null object 66 Has Availability 0 non-null float64 67 Availability 30 10609 non-null int64 68 Availability 60 10609 non-null int64 69 Availability 90 10609 non-null int64 70 Availability 365 10609 non-null int64 71 Calendar last Scraped 10609 non-null object 72 Number of Reviews 10609 non-null int64 73 First Review 8419 non-null object 74 Last Review 8418 non-null object 75 Review Scores Rating 8325 non-null float64 76 Review Scores Accuracy 8312 non-null float64 77 Review Scores Cleanliness 8316 non-null float64 78 Review Scores Checkin 8302 non-null float64 79 Review Scores Communication 8314 non-null float64 80 Review Scores Location 8300 non-null float64 81 Review Scores Value 8299 non-null float64 82 License 195 non-null object 83 Jurisdiction Names 0 non-null float64 84 Cancellation Policy 10609 non-null object 85 Calculated host listings count 10609 non-null float64 86 Reviews per Month 8419 non-null float64 87 Geolocation 10609 non-null object 88 Features 10609 non-null object dtypes: float64(27), int64(13), object(49) memory usage: 7.2+ MB
data_train.describe()
| ID | Scrape ID | Host ID | Host Response Rate | Host Acceptance Rate | Host Listings Count | Host Total Listings Count | Zipcode | Latitude | Longitude | Accommodates | Bathrooms | Bedrooms | Beds | Square Feet | Price | Weekly Price | Monthly Price | Security Deposit | Cleaning Fee | Guests Included | Extra People | Minimum Nights | Maximum Nights | Has Availability | Availability 30 | Availability 60 | Availability 90 | Availability 365 | Number of Reviews | Review Scores Rating | Review Scores Accuracy | Review Scores Cleanliness | Review Scores Checkin | Review Scores Communication | Review Scores Location | Review Scores Value | Jurisdiction Names | Calculated host listings count | Reviews per Month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.060900e+04 | 1.060900e+04 | 1.060900e+04 | 9292.000000 | 0.0 | 10607.000000 | 10607.000000 | 10252.000000 | 10609.000000 | 10609.000000 | 10609.000000 | 10571.000000 | 10595.000000 | 10576.000000 | 408.000000 | 10601.000000 | 2707.000000 | 2660.000000 | 4538.000000 | 6262.000000 | 10609.000000 | 10609.000000 | 10609.000000 | 10609.000000 | 0.0 | 10609.000000 | 10609.000000 | 10609.000000 | 10609.000000 | 10609.000000 | 8325.00000 | 8312.000000 | 8316.000000 | 8302.000000 | 8314.000000 | 8300.000000 | 8299.000000 | 0.0 | 10609.000000 | 8419.000000 |
| mean | 1.040110e+07 | 2.017041e+13 | 3.750421e+07 | 94.870319 | NaN | 9.807674 | 9.807674 | 28015.454936 | 40.420452 | -3.697172 | 3.187859 | 1.256409 | 1.293535 | 1.989883 | 350.269608 | 65.693708 | 368.510528 | 1334.361654 | 182.956148 | 29.681731 | 1.575832 | 7.494392 | 3.014045 | 980.265906 | NaN | 8.673862 | 22.770384 | 39.874729 | 205.858799 | 23.278443 | 91.61958 | 9.404475 | 9.326359 | 9.620453 | 9.644215 | 9.545301 | 9.214002 | NaN | 7.661891 | 1.945315 |
| std | 5.496576e+06 | 4.347861e+00 | 3.445434e+07 | 15.140439 | NaN | 27.624553 | 27.624553 | 20.158532 | 0.020082 | 0.023733 | 1.996570 | 0.611445 | 0.831415 | 1.531881 | 499.223019 | 56.138300 | 190.596962 | 886.872846 | 107.625972 | 26.062273 | 1.081050 | 10.873778 | 12.840494 | 9908.895741 | NaN | 9.033276 | 19.406728 | 29.213100 | 126.815462 | 38.482792 | 8.93289 | 0.923767 | 0.984319 | 0.788198 | 0.760949 | 0.754787 | 0.944337 | NaN | 19.746585 | 1.905967 |
| min | 1.862800e+04 | 2.017041e+13 | 1.745300e+04 | 0.000000 | NaN | 0.000000 | 0.000000 | 28001.000000 | 40.331888 | -3.863907 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 9.000000 | 70.000000 | 250.000000 | 70.000000 | 4.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 20.00000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | NaN | 1.000000 | 0.020000 |
| 25% | 5.820149e+06 | 2.017041e+13 | 7.823688e+06 | 100.000000 | NaN | 1.000000 | 1.000000 | 28005.000000 | 40.410002 | -3.707759 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 30.000000 | 200.000000 | 700.000000 | 100.000000 | 15.000000 | 1.000000 | 0.000000 | 1.000000 | 365.000000 | NaN | 0.000000 | 5.000000 | 13.000000 | 84.000000 | 1.000000 | 89.00000 | 9.000000 | 9.000000 | 9.000000 | 9.000000 | 9.000000 | 9.000000 | NaN | 1.000000 | 0.480000 |
| 50% | 1.154498e+07 | 2.017041e+13 | 2.746121e+07 | 100.000000 | NaN | 2.000000 | 2.000000 | 28012.000000 | 40.418568 | -3.701573 | 2.000000 | 1.000000 | 1.000000 | 2.000000 | 86.000000 | 52.000000 | 350.000000 | 1200.000000 | 150.000000 | 25.000000 | 1.000000 | 5.000000 | 2.000000 | 1125.000000 | NaN | 6.000000 | 20.000000 | 38.000000 | 247.000000 | 8.000000 | 94.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 9.000000 | NaN | 2.000000 | 1.300000 |
| 75% | 1.532662e+07 | 2.017041e+13 | 5.763190e+07 | 100.000000 | NaN | 5.000000 | 5.000000 | 28017.000000 | 40.427649 | -3.693854 | 4.000000 | 1.000000 | 2.000000 | 2.000000 | 592.000000 | 80.000000 | 490.000000 | 1666.000000 | 200.000000 | 35.000000 | 2.000000 | 15.000000 | 3.000000 | 1125.000000 | NaN | 13.000000 | 37.000000 | 65.000000 | 321.000000 | 28.000000 | 98.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | NaN | 4.000000 | 2.910000 |
| max | 1.810984e+07 | 2.017041e+13 | 1.247534e+08 | 100.000000 | NaN | 265.000000 | 265.000000 | 28850.000000 | 40.562736 | -3.526821 | 16.000000 | 8.000000 | 10.000000 | 16.000000 | 2691.000000 | 875.000000 | 995.000000 | 6990.000000 | 990.000000 | 500.000000 | 15.000000 | 276.000000 | 1125.000000 | 1000000.000000 | NaN | 30.000000 | 60.000000 | 90.000000 | 365.000000 | 446.000000 | 100.00000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 | NaN | 145.000000 | 17.210000 |
Columnas que parecen de menor interés por el tipo de dato o la información que contiene:
La exclusión de estas columnas se fundamenta en la relevancia, la calidad de los datos, y la alineación con los objetivos del análisis. El eliminado de características se realizará más adelante.
# Count missing values per column
missing_values = data_train.isnull().sum()
# Keep only columns with missing values; the ascending sort puts the
# largest counts at the TOP of the horizontal bar chart
missing_values_filtered = missing_values[missing_values > 0].sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(10, 14))
ax.barh(missing_values_filtered.index, missing_values_filtered.values, color='lightseagreen')
# Hide the plot frame
for s in ['top', 'bottom', 'left', 'right']:
    ax.spines[s].set_visible(False)
# Remove x, y Ticks
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
# Add padding between axes and labels
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=5)
# Annotate each bar with its missing-value count
for i in ax.patches:
    plt.text(i.get_width()+0.2, i.get_y()+0.5,
             str(round((i.get_width()), 2)),
             fontsize=10, fontweight='500',
             color='lightslategrey', va='center')
ax.set_title('Número de Valores Faltantes por Columna', fontsize=18, fontweight='500')
plt.show()
# Histograms of every numeric column to eyeball distributions, skew and outliers
data_train.hist(bins=20, figsize=(24,16), color='teal')
plt.show()
Sesgo en la distribución: Muchas de las variables presentan una distribución sesgada, lo que significa que la mayoría de los valores se concentran en un extremo de la distribución. Por ejemplo, Security Deposit, Extra People, Minimum Nights, Maximum Nights y Cleaning Fee muestran un sesgo positivo claro, con muchos valores bajos y una cola larga hacia valores más altos.
Variables con valores cero predominantes: Algunas variables como Square Feet, Review Scores Rating, y Number of Reviews tienen una cantidad significativa de ceros o valores bajos, lo que podría sugerir que muchos listados no proporcionan esta información o que hay un gran número de listados nuevos o menos populares.
Outliers: Las columnas Price, Security Deposit, Cleaning Fee, Number of Reviews, Minimum Nights, Maximum Nights y Extra People parecen tener valores atípicos (por la presencia de barras individuales alejadas de la mayoría de los datos). También deberíamos analizar las columnas Bathrooms, Bedrooms y Beds.
Gráficas en blanco: Las gráficas que aparecen en blanco o prácticamente sin datos visibles pueden sugerir valores constantes o únicos, valores mayoritariamente ceros, escalas desbalanceadas o datos faltantes.
Columnas numéricas que podrían estar relacionadas con el precio:
Cleaning Fee y Security Deposit: Alojamientos más caros podrían tener tasas de limpieza y depósitos de seguridad más altos.
Accommodate, Bedrooms, Bathrooms, Beds: Estas características suelen estar fuertemente relacionadas con el precio, ya que indican el tamaño y la capacidad del alojamiento.
Minimum Nights y Maximum Nights: Podrían influir en el precio en términos de descuentos por estancias largas o políticas de estancia mínima.
Review Scores Rating y subcategorías de Review Scores: Las puntuaciones más altas podrían permitir a los anfitriones cobrar más por sus alojamientos debido a la percepción de mayor calidad o experiencia.
Vamos a comprobarlo...
# Correlation matrix over the numeric columns of the training set
numeric_train = data_train.select_dtypes(include=[np.number])
corr_matrix = numeric_train.corr()

# Heatmap of the full correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix)
plt.title('Matriz de Correlación', fontsize=15)
plt.show()

# Bar chart of each feature's correlation with 'Price'
# (drop Price itself: its self-correlation of 1.0 is uninformative)
corr_with_price = corr_matrix["Price"].drop('Price').sort_values(ascending=False)
plt.figure(figsize=(14, 10))
sns.barplot(x=corr_with_price.values, y=corr_with_price.index)
plt.title('Correlación con Price', fontsize=16)
plt.xlabel('Coeficiente de Correlación')
plt.ylabel('')
plt.show()
Si queremos preparar un modelo de predicción de precios, vamos a priorizar las características que tienen una correlación más fuerte con la variable objetivo (Price). Sin embargo, tendremos en cuenta que la correlación no implica causalidad y que las correlaciones bajas no necesariamente significan que una característica no sea importante; podría influir en el precio de manera no lineal o interactuar con otras variables de formas que no se capturan con la correlación lineal simple. Algunas variables con baja correlación o correlación negativa podrían todavía ser útiles al combinarse con otras o al formar interacciones en el modelo.
# Helper: draw side-by-side histograms for several DataFrame columns.
def plot_histograms(df, columns, bins=20, color='royalblue', figsize=(20, 5)):
    """Draw one histogram per column in a single row of subplots.

    NaNs are dropped per column before plotting; grid lines and spines
    are removed for a cleaner look.
    """
    fig, axs = plt.subplots(nrows=1, ncols=len(columns), figsize=figsize)
    # plt.subplots returns a bare Axes (not an array) when ncols == 1
    if len(columns) == 1:
        axs = [axs]
    for column, axis in zip(columns, axs):
        df[column].dropna().hist(bins=bins, ax=axis, color=color)
        axis.set_title(column)
        axis.set_xlabel('Valor')
        axis.set_ylabel('Frecuencia')
        # Strip the grid and every spine for a minimal style
        axis.grid(False)
        for spine in axis.spines.values():
            spine.set_visible(False)
    plt.tight_layout()
    plt.show()
from pandas.plotting import scatter_matrix
# Helper: scatter matrix (pair plot) for a selected subset of attributes.
def plot_scatter_matrix(df, attributes, figsize=(14, 10), diagonal='kde',
                        color='royalblue', alpha=0.7, s=100, edgecolors='white',
                        suptitle='Matriz de Dispersión'):
    """Render pandas' scatter_matrix for df[attributes] with a shared title.

    The diagonal shows a KDE by default; styling kwargs are forwarded to
    pandas.plotting.scatter_matrix.
    """
    sns.set(style="whitegrid")
    scatter_matrix(df[attributes], figsize=figsize, diagonal=diagonal,
                   color=color, alpha=alpha, s=s, edgecolors=edgecolors)
    plt.suptitle(suptitle, size=16)
    # Tighten spacing so the panels sit close together under the title
    plt.subplots_adjust(top=0.94, wspace=0.05, hspace=0.1)
    plt.show()
# Distributions of the host-related numeric columns
columns_to_include = ['Host ID', 'Host Response Rate', 'Host Listings Count', 'Host Total Listings Count']
plot_histograms(data_train, columns_to_include, bins=15, figsize=(20, 5))
print(data_train['Host Response Time'].value_counts())
Host Response Time within an hour 5698 within a few hours 2168 within a day 1217 a few days or more 209 Name: count, dtype: int64
print(data_train['Host Listings Count'].describe())
count 10607.000000 mean 9.807674 std 27.624553 min 0.000000 25% 1.000000 50% 2.000000 75% 5.000000 max 265.000000 Name: Host Listings Count, dtype: float64
# Inspect the records whose 'Host Listings Count' exceeds 100
high_listing_hosts = data_train[data_train['Host Listings Count'] > 100]
host_counts = high_listing_hosts['Host Name'].value_counts()
print(f"Número de 'Host Name' únicos con más de 100 listados: {host_counts.shape[0]}")
print(f"Cantidad de veces que cada anfitrión aparece con más de 100 listados: {host_counts}")
# Group by "Host Name" and take the maximum "Host Listings Count" per host
host_listings_max = high_listing_hosts.groupby('Host Name')['Host Listings Count'].max()
print(f"Valor máximo de cada {host_listings_max}")
Número de 'Host Name' únicos con más de 100 listados: 6 Cantidad de veces que cada anfitrión aparece con más de 100 listados: Host Name Raquel 118 Erasmo´S 75 Spain Select 65 Javier 53 Teresa 3 Loic 1 Name: count, dtype: int64 Valor máximo de cada Host Name Erasmo´S 114.0 Javier 207.0 Loic 265.0 Raquel 164.0 Spain Select 142.0 Teresa 136.0 Name: Host Listings Count, dtype: float64
# Correlation between host-related variables and price
columns_of_interest = ['Host ID', 'Host Total Listings Count','Host Listings Count','Host Response Rate','Price']
correlation_matrix = data_train[columns_of_interest].corr()
plt.figure(figsize=(5, 3))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.show()
La matriz de correlación sugiere que las características del anfitrión tienen una relación débil con el precio. La correlación negativa moderada entre el tiempo de respuesta del anfitrión y su tasa de respuesta resalta que los anfitriones que tardan más en responder tienden a tener tasas de respuesta más bajas. A pesar de las bajas correlaciones generales, es importante valorar la información única que cada variable aporta antes de su posible exclusión.
Con una correlación máxima de 0.19 entre el precio y el número de listados del anfitrión, el impacto directo de las características del anfitrión en el precio es limitado. No obstante, vamos a explorar otras características más allá de las del anfitrión para determinar cuáles influirán significativamente en el precio y cuáles podrán ser omitidas en el modelado final.
# Scatter matrix of host features against price
attributes = ['Host ID', 'Host Listings Count', 'Host Response Rate', 'Price']
plot_scatter_matrix(data_train, attributes)
Columnas que no se incluirán en el modelo:
- Host ID: correlación baja; poco o ningún efecto lineal sobre el precio.
- Host Name: puede haber diferentes anfitriones con el mismo nombre y no se considera una variable relacionada con el precio.
- Host Location: información menos relevante o inespecífica.
- Host Neighbourhood: información menos relevante.
- Host About: requeriría procesamiento o análisis de texto.
- Host Total Listings Count: tiene los mismos valores que Host Listings Count (correlación 1).
- Experiences Offered: todos los registros tienen el valor 'none' (no aporta información).

Columnas de texto/categóricas que se convertirán a categorías numéricas:
- Host Response Time: se asignarán valores del 1-4 al tratarse de una categoría ordinal.
- Host Verifications: indicará la cantidad total de verificaciones.

Columnas numéricas que se mantienen:
- Host Listings Count: se aplicará transformación; cola larga.

columns_to_include = ['Latitude', 'Longitude', 'Zipcode']
plot_histograms(data_train, columns_to_include, bins=15, figsize=(20, 5))
# Quick overview of the location-related categorical columns
data_train[['City', 'State', 'Country Code', 'Country']].describe()
| City | State | Country Code | Country | |
|---|---|---|---|---|
| count | 10608 | 10572 | 10609 | 10609 |
| unique | 19 | 15 | 1 | 1 |
| top | Madrid | Comunidad de Madrid | ES | Spain |
| freq | 10565 | 8520 | 10609 | 10609 |
data_train[['Neighbourhood Cleansed', 'Neighbourhood Group Cleansed']].describe()
| Neighbourhood Cleansed | Neighbourhood Group Cleansed | |
|---|---|---|
| count | 10609 | 10609 |
| unique | 126 | 21 |
| top | Embajadores | Centro |
| freq | 1471 | 5436 |
# Boxplot of prices per district
sns.set(style="whitegrid", palette="pastel")
plt.figure(figsize=(15, 7))
sns.boxplot(x='Neighbourhood Group Cleansed', y='Price', data=data_train, width=0.6, fliersize=5)
plt.title('Distribución de Precios por Distrito en Madrid', fontsize=16, weight=500)
plt.xlabel('Distrito', fontsize=14)
plt.ylabel('Precio', fontsize=14)
plt.xticks(rotation=90, ha='right')
sns.despine(trim=True)
plt.subplots_adjust(top=0.96)
plt.show()
Tras la revisión anterior, decido mantener temporalmente tanto la columna Neighbourhood Cleansed (barrios) como Neighbourhood Group Cleansed (distritos) en el análisis. Posteriormente, en función de los resultados obtenidos en análisis más avanzados, como la correlación entre variables y otros métodos estadísticos, tomaré una decisión informada sobre cuál de las dos columnas es más pertinente mantener para el análisis final (o ninguna).
np.random.seed(0)

def safe_mean(data):
    """Return the mean of ``data`` ignoring NaNs, or NaN for degenerate input.

    Intended as the ``reduce_C_function`` of ``plt.hexbin``, where some bins
    may be empty or contain only NaN prices.

    Fix: ``np.nanmean`` emits a "Mean of empty slice" RuntimeWarning (and
    still returns NaN) when every value is NaN; guard that case explicitly
    along with the empty case so the plot renders without warnings.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0 or np.all(np.isnan(values)):
        return np.nan
    return np.nanmean(values)
# Hexbin heat map: mean nightly price aggregated over geographic bins.
plt.figure(figsize=(10, 6))
hexbin_artist = plt.hexbin(
    data_train['Longitude'],
    data_train['Latitude'],
    C=data_train['Price'],
    gridsize=50,
    cmap='rocket_r',
    reduce_C_function=safe_mean,  # NaN-safe mean for sparse/empty bins
    edgecolors='none',
)
colorbar = plt.colorbar(hexbin_artist, label='Precio medio por noche (€)')
plt.xlabel('Longitud', fontsize=12, fontweight='light')
plt.ylabel('Latitud', fontsize=12, fontweight='light')
plt.title('Mapa de calor de precios por noche en Madrid', fontsize=14, fontweight='500')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
# Pairwise correlations between the geographic features and price.
geo_price_columns = ['Latitude', 'Longitude', 'Zipcode', 'Price']
correlation_matrix = data_train[geo_price_columns].corr()
plt.figure(figsize=(5, 4))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.show()
Nuevas columnas que se plantean:
LatLongInteraction, refleja la multiplicación de longitud y latitud.

Columnas que no se incluirán en el modelo:
Neighbourhood, 33,7 % de datos faltantes e información redundante. Street, City, State, Country Code, Country, Market, Smart Location, información no relevante. Latitude, Longitude, información redundante al tener LatLongInteraction. Zipcode, la información geográfica relevante ya está cubierta por otra columna y el código postal numérico no aporta valor significativo al modelo.

Columnas de texto/categóricas que se convertirán a categorías numéricas:
Neighbourhood Cleansed, Neighbourhood Group Cleansed.

columns_to_include = ['Accommodates','Bathrooms','Bedrooms','Beds','Square Feet']
# Histograms of the capacity-related numeric columns selected above.
# NOTE(review): plot_histograms is a project helper defined elsewhere in the notebook.
plot_histograms(data_train, columns_to_include, bins=15, figsize=(24, 5))
# Frequency bar charts for the three categorical listing attributes,
# drawn side by side on one row of axes.
sns.set(style="whitegrid")
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
bar_specs = [
    ('Property Type', 'Distribución de Tipos de Propiedades', 'Tipo de Propiedad', 90),
    ('Room Type', 'Distribución de Tipos de Habitaciones', 'Tipo de Habitación', 0),
    ('Bed Type', 'Distribución de Tipos de Cama', 'Tipo de Cama', 0),
]
for ax, (column, title, xlabel, tick_rotation) in zip(axes, bar_specs):
    data_train[column].value_counts().plot(kind='bar', color='darkslategray', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frecuencia')
    ax.tick_params(axis='x', rotation=tick_rotation)
sns.despine(trim=True)
plt.tight_layout()
plt.show()
# Price distribution per categorical attribute, three boxplots side by side.
sns.set_style("whitegrid")
plt.figure(figsize=(16, 10))
box_specs = [
    ('Property Type', 'Distribución de Precios por Tipo de Propiedad', 'Tipo de Propiedad'),
    ('Room Type', 'Distribución de Precios por Tipo de Habitación', 'Tipo de Habitación'),
    ('Bed Type', 'Distribución de Precios por Tipo de Cama', 'Tipo de Cama'),
]
for position, (column, title, xlabel) in enumerate(box_specs, start=1):
    plt.subplot(1, 3, position)
    # hue=column with legend=False keeps per-category colours without a legend
    sns.boxplot(x=column, y='Price', data=data_train, palette="magma",
                hue=column, legend=False)
    plt.xticks(rotation=45, ha='right')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Precio')
sns.despine(trim=True)
plt.tight_layout(pad=2)
plt.show()
# Scatter plot: price vs bedrooms, one facet per room type,
# points coloured by property type.
sns.set_style("whitegrid")
facet_grid = sns.FacetGrid(data_train, col="Room Type", hue="Property Type",
                           col_wrap=3, height=4)
facet_grid.map(sns.scatterplot, "Bedrooms", "Price")
facet_grid.add_legend()
plt.show()
# Scatter plot: price vs bathrooms, one facet per room type,
# points coloured by property type.
sns.set_style("whitegrid")
facet_grid = sns.FacetGrid(data_train, col="Room Type", hue="Property Type",
                           col_wrap=3, height=4)
facet_grid.map(sns.scatterplot, "Bathrooms", "Price")
facet_grid.add_legend()
plt.show()
# Pair plot of price against the capacity-related features,
# lower triangle only, coloured by room type.
price_capacity_vars = ["Price", "Bathrooms", "Bedrooms", "Beds"]
sns.pairplot(
    data_train,
    vars=price_capacity_vars,
    hue="Room Type",
    corner=True,
    palette="muted",
)
plt.show()
# Univariate boxplots for bathroom, bedroom and bed counts.
# Fix: the original called sns.despine(trim=True) after the second subplot
# (so it only affected the axes that existed at that point) and then a second
# despine with different arguments at the end; a single despine after all
# three subplots applies the same treatment consistently.
sns.set_style("whitegrid")
plt.figure(figsize=(15, 3))
count_specs = [
    ('Bathrooms', 'Distribución de Baños'),
    ('Bedrooms', 'Distribución de Dormitorios'),
    ('Beds', 'Distribución de Camas'),
]
for position, (column, title) in enumerate(count_specs, start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(data=data_train, x=column, color="cornflowerblue")
    plt.title(title)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.tight_layout()
plt.show()
# Price distribution per bathroom/bedroom/bed count.
# Fix: the x-axis labels were copy-pasted from the previous categorical cell
# ('Tipo de Propiedad', 'Tipo de Habitación', 'Tipo de Cama') and did not
# describe the plotted variables; they now match each subplot's x column.
sns.set_style("whitegrid")
plt.figure(figsize=(18, 8))
count_specs = [
    ('Bathrooms', 'Distribución de Precios por Número de Baños', 'Número de Baños'),
    ('Bedrooms', 'Distribución de Precios por Número de Dormitorios', 'Número de Dormitorios'),
    ('Beds', 'Distribución de Precios por Número de Camas', 'Número de Camas'),
]
for position, (column, title, xlabel) in enumerate(count_specs, start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(x=column, y='Price', data=data_train, color="cornflowerblue")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Precio')
sns.despine(trim=True)
plt.tight_layout()
plt.show()
La presencia de Private Room y Shared Room en la columna "Room Type" ofrece una explicación plausible para algunos de los valores atípicos en la cantidad de baños y dormitorios. Por ejemplo, una propiedad podría tener un alto número de baños disponibles porque se trata de un hostal o un bed & breakfast donde cada habitación privada o compartida tiene acceso a su propio baño, aunque el número de dormitorios sea bajo.
# Inspect records at the bedroom/bathroom extremes to spot data inconsistencies.
inspect_cols = ['Bathrooms', 'Beds', 'Price', 'Property Type']
ten_bedroom_rows = data_train.loc[data_train['Bedrooms'] == 10, inspect_cols]
zero_bedroom_rows = data_train.loc[data_train['Bedrooms'] == 0, inspect_cols]
many_bathroom_rows = data_train.loc[data_train['Bathrooms'] >= 7, inspect_cols]
print(f"Registros con 10 dormitorios:\n{ten_bedroom_rows}")
print(f"\nRegistros con 0 dormitorios:\n{zero_bedroom_rows}")
print(f"\nRegistros con 7 baños o más:\n{many_bathroom_rows}")
Registros con 10 dormitorios:
Bathrooms Beds Price Property Type
82 6.0 16.0 40.0 Bed & Breakfast
1782 5.5 1.0 40.0 House
3859 6.0 16.0 180.0 Apartment
5557 6.5 16.0 850.0 Apartment
7916 0.0 2.0 39.0 House
Registros con 0 dormitorios:
Bathrooms Beds Price Property Type
42 1.0 1.0 60.0 Apartment
53 1.0 1.0 53.0 Apartment
77 1.0 2.0 66.0 Loft
126 1.0 3.0 55.0 Apartment
132 1.0 3.0 43.0 Serviced apartment
138 1.0 1.0 135.0 Apartment
142 1.0 2.0 69.0 Apartment
167 1.0 1.0 51.0 Apartment
170 1.0 1.0 59.0 Apartment
188 1.0 2.0 69.0 Apartment
192 1.0 1.0 69.0 Apartment
197 1.0 2.0 100.0 Loft
199 1.0 1.0 30.0 Apartment
227 1.0 1.0 50.0 Apartment
236 1.0 4.0 70.0 Apartment
242 1.0 2.0 100.0 Apartment
255 1.0 1.0 25.0 Apartment
307 1.0 1.0 45.0 Apartment
311 1.0 1.0 50.0 Condominium
320 1.0 2.0 60.0 Apartment
341 1.0 1.0 74.0 Apartment
346 1.0 1.0 40.0 Apartment
352 1.0 1.0 55.0 Apartment
366 1.0 1.0 59.0 Apartment
376 1.0 2.0 65.0 Apartment
404 1.0 1.0 70.0 Apartment
407 1.0 1.0 70.0 Apartment
446 1.0 2.0 66.0 Apartment
517 1.0 2.0 65.0 Apartment
553 1.0 2.0 50.0 Apartment
557 1.0 2.0 50.0 House
582 1.0 1.0 70.0 Apartment
595 1.0 1.0 35.0 Apartment
622 1.0 2.0 40.0 Apartment
623 1.0 1.0 60.0 Loft
636 1.0 2.0 64.0 Apartment
649 1.0 1.0 45.0 Apartment
671 1.0 1.0 35.0 Apartment
682 1.0 1.0 65.0 Loft
709 1.0 1.0 70.0 Apartment
716 1.0 2.0 75.0 Apartment
725 1.0 1.0 50.0 Apartment
730 1.0 1.0 29.0 Apartment
731 1.0 1.0 80.0 Apartment
748 1.0 1.0 45.0 Loft
784 2.0 3.0 133.0 Apartment
786 1.0 1.0 30.0 Apartment
791 1.0 1.0 115.0 Loft
804 1.0 2.0 59.0 Apartment
811 1.0 1.0 71.0 Apartment
822 1.0 1.0 57.0 Apartment
839 1.0 1.0 35.0 Apartment
859 1.0 1.0 60.0 Apartment
883 1.0 1.0 22.0 Loft
896 1.0 2.0 71.0 Apartment
904 1.0 2.0 47.0 Condominium
909 1.0 2.0 50.0 Apartment
918 1.0 1.0 60.0 Apartment
944 1.0 1.0 40.0 Apartment
964 1.0 1.0 80.0 Apartment
971 1.0 1.0 40.0 Apartment
976 1.0 1.0 13.0 Apartment
996 1.0 1.0 55.0 Apartment
1005 1.0 2.0 44.0 Loft
1026 1.0 1.0 58.0 Apartment
1027 1.0 1.0 55.0 Apartment
1043 1.0 1.0 65.0 Apartment
1060 1.0 1.0 30.0 Apartment
1063 1.0 2.0 50.0 Apartment
1085 1.0 1.0 60.0 Apartment
1118 1.0 2.0 76.0 Loft
1134 1.0 1.0 40.0 Apartment
1153 1.0 2.0 45.0 Apartment
1168 1.0 2.0 42.0 Apartment
1171 1.0 1.0 42.0 Apartment
1187 1.0 1.0 53.0 Apartment
1191 1.0 1.0 46.0 Apartment
1192 1.0 2.0 75.0 Apartment
1215 1.0 2.0 65.0 Apartment
1249 1.0 1.0 75.0 Apartment
1264 1.0 1.0 50.0 Apartment
1282 1.0 1.0 150.0 Apartment
1303 1.0 1.0 55.0 Apartment
1314 1.0 2.0 89.0 Apartment
1317 1.0 1.0 50.0 Loft
1332 1.0 1.0 76.0 Apartment
1333 1.0 1.0 35.0 Apartment
1370 1.0 1.0 49.0 Apartment
1383 1.0 2.0 40.0 Serviced apartment
1388 1.0 2.0 60.0 Apartment
1392 1.0 1.0 35.0 Apartment
1421 1.0 2.0 46.0 Apartment
1425 1.0 2.0 65.0 Loft
1427 1.0 1.0 28.0 Condominium
1429 1.0 2.0 65.0 Apartment
1435 1.0 2.0 45.0 Apartment
1458 1.0 2.0 50.0 Loft
1476 1.0 2.0 65.0 Apartment
1485 1.0 1.0 40.0 Condominium
1493 1.0 1.0 39.0 Apartment
1502 1.0 1.0 60.0 Apartment
1516 1.0 2.0 36.0 Apartment
1524 1.0 2.0 90.0 Apartment
1550 1.0 1.0 39.0 Apartment
1595 1.0 2.0 50.0 Apartment
1598 1.0 1.0 70.0 Apartment
1603 1.0 3.0 35.0 Apartment
1609 1.0 2.0 60.0 Loft
1635 1.0 1.0 70.0 Loft
1643 1.0 1.0 48.0 Apartment
1655 1.0 1.0 59.0 Apartment
1660 1.0 1.0 69.0 Apartment
1671 1.0 1.0 119.0 Apartment
1682 1.0 1.0 45.0 Apartment
1704 1.0 1.0 49.0 Apartment
1733 1.0 2.0 43.0 Apartment
1759 1.0 1.0 28.0 Apartment
1788 1.0 1.0 50.0 Apartment
1798 1.0 1.0 37.0 Apartment
1815 1.0 1.0 60.0 Apartment
1816 1.0 2.0 55.0 Apartment
1818 1.0 2.0 60.0 Apartment
1830 1.0 2.0 80.0 Apartment
1831 1.0 1.0 33.0 Apartment
1832 1.0 1.0 35.0 Condominium
1838 1.0 1.0 48.0 Apartment
1854 1.0 2.0 50.0 Apartment
1878 1.0 2.0 46.0 Apartment
1880 1.0 1.0 45.0 Loft
1885 1.0 2.0 75.0 Apartment
1890 1.0 1.0 109.0 Apartment
1897 1.0 4.0 55.0 Other
1902 1.0 1.0 31.0 Apartment
1918 1.0 1.0 40.0 Apartment
1929 1.0 1.0 40.0 Apartment
1941 1.0 1.0 30.0 Apartment
1944 1.0 1.0 80.0 Apartment
1957 1.0 2.0 72.0 Apartment
1978 1.0 1.0 49.0 Apartment
1985 1.0 2.0 45.0 Apartment
1996 1.0 1.0 40.0 Apartment
1997 1.0 2.0 42.0 Apartment
2027 1.0 1.0 85.0 Apartment
2033 1.0 1.0 43.0 Apartment
2058 1.0 1.0 60.0 Apartment
2066 1.0 1.0 45.0 Apartment
2072 1.0 2.0 65.0 Apartment
2075 1.0 1.0 49.0 Apartment
2084 1.0 1.0 60.0 Loft
2090 1.0 1.0 40.0 Apartment
2093 1.0 1.0 50.0 Loft
2095 1.0 2.0 95.0 Apartment
2142 1.0 1.0 55.0 Apartment
2167 1.0 1.0 32.0 Serviced apartment
2172 1.0 1.0 50.0 Apartment
2183 1.0 2.0 50.0 Apartment
2189 1.0 1.0 50.0 Apartment
2199 1.0 2.0 200.0 Apartment
2222 1.0 1.0 55.0 Apartment
2233 1.0 1.0 50.0 Apartment
2255 1.0 2.0 49.0 Apartment
2260 1.0 1.0 70.0 Apartment
2275 1.0 1.0 60.0 Apartment
2285 2.0 1.0 39.0 Apartment
2297 1.0 1.0 30.0 Apartment
2302 1.0 1.0 69.0 Apartment
2313 1.0 1.0 50.0 Loft
2315 1.0 2.0 50.0 Apartment
2330 1.0 1.0 60.0 Apartment
2343 1.0 2.0 50.0 Apartment
2344 1.0 2.0 70.0 Condominium
2356 1.0 1.0 47.0 Apartment
2380 1.0 2.0 53.0 Apartment
2382 1.0 1.0 60.0 Apartment
2407 1.0 1.0 70.0 Apartment
2420 1.0 2.0 125.0 Apartment
2426 1.0 1.0 525.0 Apartment
2439 1.0 1.0 48.0 Apartment
2465 1.0 2.0 100.0 Apartment
2488 1.0 2.0 29.0 Apartment
2515 1.0 1.0 89.0 Apartment
2518 1.0 1.0 40.0 Apartment
2537 1.0 2.0 125.0 Apartment
2550 1.0 2.0 47.0 Apartment
2559 1.0 1.0 25.0 Apartment
2562 1.0 1.0 80.0 Apartment
2591 1.0 4.0 120.0 Apartment
2593 1.0 1.0 40.0 Apartment
2610 1.0 1.0 50.0 Apartment
2628 1.0 1.0 60.0 Apartment
2637 1.0 1.0 30.0 Apartment
2642 1.0 1.0 70.0 Apartment
2643 1.0 1.0 48.0 Apartment
2647 1.0 1.0 60.0 Apartment
2655 1.0 2.0 40.0 Apartment
2720 1.0 2.0 80.0 Apartment
2722 1.0 2.0 45.0 Apartment
2728 1.0 1.0 50.0 Apartment
2738 1.0 1.0 40.0 Apartment
2748 1.0 2.0 70.0 Apartment
2750 1.0 2.0 95.0 Loft
2788 1.0 3.0 67.0 Apartment
2797 1.0 1.0 55.0 Loft
2810 1.0 2.0 75.0 Apartment
2817 1.0 2.0 60.0 Apartment
2819 2.0 1.0 18.0 Apartment
2830 1.0 2.0 60.0 Apartment
2835 1.0 1.0 29.0 Apartment
2847 1.0 1.0 48.0 Apartment
2868 1.0 2.0 74.0 Apartment
2887 1.0 1.0 90.0 Apartment
2888 1.0 1.0 46.0 Apartment
2900 1.0 2.0 62.0 Loft
2933 1.0 1.0 43.0 Apartment
2954 1.0 1.0 89.0 Apartment
2959 1.0 1.0 55.0 Apartment
3001 1.0 2.0 50.0 Apartment
3004 1.0 1.0 43.0 Apartment
3020 1.0 1.0 36.0 Apartment
3022 1.0 1.0 40.0 Apartment
3025 1.0 2.0 65.0 Apartment
3038 1.0 2.0 49.0 Apartment
3059 1.0 1.0 38.0 Apartment
3060 1.0 1.0 30.0 Apartment
3072 1.0 1.0 29.0 Apartment
3081 1.0 1.0 54.0 Apartment
3100 1.0 1.0 69.0 Apartment
3118 1.0 1.0 45.0 Apartment
3148 1.0 3.0 89.0 Apartment
3151 1.0 1.0 50.0 Apartment
3163 1.0 2.0 60.0 Apartment
3172 1.0 1.0 45.0 Apartment
3185 1.0 1.0 40.0 Apartment
3207 2.0 4.0 100.0 Apartment
3218 1.0 4.0 59.0 Apartment
3243 1.0 3.0 90.0 Apartment
3256 1.0 1.0 38.0 Apartment
3262 1.0 1.0 45.0 Other
3272 1.0 2.0 75.0 Apartment
3279 1.0 3.0 55.0 Apartment
3290 1.0 2.0 60.0 Apartment
3322 1.0 1.0 100.0 Apartment
3325 1.0 2.0 64.0 Apartment
3338 1.0 1.0 65.0 Apartment
3340 1.0 1.0 75.0 Apartment
3367 2.0 2.0 80.0 Loft
3383 1.0 1.0 50.0 Apartment
3394 1.0 1.0 50.0 Apartment
3443 1.0 1.0 79.0 Loft
3444 1.0 1.0 50.0 Condominium
3447 1.0 2.0 89.0 Apartment
3470 1.0 1.0 30.0 Apartment
3480 1.0 1.0 45.0 Apartment
3509 1.0 1.0 45.0 Apartment
3515 1.0 1.0 33.0 Apartment
3554 1.0 1.0 53.0 Apartment
3586 1.0 1.0 98.0 Apartment
3598 1.0 1.0 89.0 Apartment
3616 1.0 2.0 76.0 Apartment
3626 1.0 2.0 50.0 Apartment
3628 1.0 1.0 63.0 Apartment
3630 1.0 1.0 60.0 Apartment
3646 1.0 2.0 49.0 Apartment
3650 1.0 1.0 41.0 Apartment
3656 1.0 2.0 60.0 House
3658 1.0 1.0 45.0 Apartment
3673 1.0 2.0 65.0 Apartment
3678 1.0 2.0 60.0 Apartment
3692 1.0 1.0 41.0 Apartment
3696 1.0 1.0 50.0 Other
3699 1.0 2.0 80.0 Apartment
3705 1.0 1.0 34.0 Apartment
3719 1.0 2.0 45.0 Apartment
3721 1.0 2.0 55.0 Apartment
3727 1.0 1.0 55.0 Apartment
3736 1.0 4.0 69.0 Apartment
3767 1.0 1.0 40.0 Apartment
3825 1.0 1.0 50.0 Loft
3838 1.0 2.0 50.0 Apartment
3847 1.0 2.0 45.0 Apartment
3857 1.0 2.0 42.0 Apartment
3869 1.0 1.0 70.0 Apartment
3881 1.0 1.0 55.0 Apartment
3889 1.0 3.0 65.0 Apartment
3899 1.0 1.0 49.0 Apartment
3923 1.0 2.0 40.0 Apartment
3928 1.0 1.0 90.0 Apartment
3976 1.0 1.0 43.0 Loft
3985 1.0 1.0 65.0 Apartment
3990 1.0 1.0 46.0 Apartment
4007 1.0 2.0 60.0 Apartment
4012 1.0 2.0 50.0 Apartment
4024 1.0 1.0 43.0 Loft
4029 1.0 1.0 54.0 Apartment
4083 1.0 1.0 60.0 Apartment
4152 1.0 1.0 45.0 Apartment
4156 1.0 1.0 75.0 Apartment
4164 1.0 1.0 30.0 Apartment
4165 1.0 2.0 50.0 Apartment
4171 1.0 2.0 75.0 Apartment
4184 1.0 1.0 43.0 Apartment
4189 1.0 2.0 30.0 Apartment
4191 1.0 2.0 57.0 Apartment
4202 1.0 2.0 50.0 Apartment
4215 1.0 1.0 50.0 Apartment
4216 1.0 2.0 50.0 Apartment
4220 1.0 1.0 42.0 Loft
4258 1.0 1.0 45.0 Apartment
4271 1.0 2.0 43.0 Loft
4316 1.0 2.0 50.0 Apartment
4331 1.0 1.0 36.0 Apartment
4333 1.0 2.0 50.0 Apartment
4337 1.0 1.0 60.0 Apartment
4361 1.0 2.0 97.0 Apartment
4366 1.0 1.0 60.0 Apartment
4368 1.0 1.0 70.0 Apartment
4381 1.0 1.0 49.0 Apartment
4387 1.0 2.0 95.0 Apartment
4392 1.0 1.0 76.0 Apartment
4452 1.0 1.0 45.0 Apartment
4455 1.0 2.0 69.0 Apartment
4461 1.0 1.0 50.0 Loft
4465 1.0 1.0 45.0 Apartment
4501 1.0 1.0 50.0 Condominium
4532 1.0 3.0 59.0 Apartment
4536 1.0 2.0 50.0 Apartment
4543 1.0 2.0 69.0 Condominium
4549 1.0 2.0 45.0 Apartment
4555 1.0 2.0 47.0 Apartment
4561 1.0 2.0 44.0 Condominium
4568 1.0 1.0 75.0 Loft
4619 1.0 1.0 65.0 Apartment
4628 1.0 1.0 74.0 Apartment
4638 1.0 2.0 49.0 Apartment
4649 1.0 1.0 42.0 Apartment
4654 1.0 2.0 75.0 Apartment
4660 1.0 2.0 60.0 House
4668 1.0 1.0 48.0 Apartment
4706 1.0 1.0 18.0 Apartment
4712 1.0 1.0 90.0 Apartment
4716 1.0 1.0 43.0 Apartment
4733 1.0 1.0 43.0 Apartment
4736 1.0 2.0 55.0 Loft
4737 1.0 2.0 89.0 Apartment
4761 1.0 1.0 80.0 Apartment
4784 1.0 1.0 49.0 Apartment
4814 1.0 1.0 80.0 Apartment
4827 1.0 1.0 84.0 Apartment
4831 1.0 1.0 47.0 Hostel
4846 1.0 2.0 70.0 Apartment
4848 1.0 1.0 49.0 Apartment
4876 1.0 2.0 50.0 Other
4892 1.0 3.0 70.0 Apartment
4903 1.0 1.0 35.0 Condominium
4909 1.0 2.0 59.0 Apartment
4919 1.0 1.0 39.0 Apartment
4921 1.0 3.0 55.0 Apartment
4926 1.0 1.0 45.0 Other
4927 1.0 2.0 64.0 Apartment
4940 1.0 1.0 79.0 Apartment
4960 1.0 1.0 14.0 Apartment
4980 1.0 1.0 80.0 Loft
5003 1.0 2.0 79.0 Apartment
5006 1.0 1.0 80.0 Apartment
5021 1.0 1.0 41.0 Apartment
5036 1.0 3.0 60.0 Apartment
5041 1.0 1.0 90.0 Apartment
5070 1.0 1.0 47.0 Apartment
5083 1.0 1.0 55.0 Apartment
5092 1.0 1.0 70.0 Apartment
5099 1.0 1.0 58.0 Apartment
5155 1.0 2.0 50.0 Apartment
5173 1.0 1.0 70.0 Apartment
5222 1.0 1.0 48.0 Apartment
5227 1.0 2.0 50.0 Apartment
5234 1.0 1.0 55.0 Apartment
5239 1.0 1.0 65.0 Apartment
5254 1.0 1.0 89.0 Apartment
5265 1.0 1.0 45.0 Apartment
5288 1.0 1.0 52.0 Apartment
5293 1.0 1.0 50.0 Apartment
5303 1.0 2.0 70.0 Apartment
5311 1.0 1.0 29.0 Apartment
5329 1.0 2.0 55.0 Apartment
5354 1.0 3.0 100.0 Condominium
5360 1.0 1.0 60.0 Apartment
5371 1.0 2.0 80.0 Apartment
5376 1.0 2.0 50.0 Apartment
5397 1.0 2.0 55.0 Loft
5405 1.0 1.0 55.0 Apartment
5435 1.0 1.0 43.0 Apartment
5518 1.0 1.0 25.0 Apartment
5520 1.0 2.0 50.0 Apartment
5535 1.0 1.0 99.0 Apartment
5546 1.0 1.0 79.0 Apartment
5567 1.0 1.0 39.0 Apartment
5573 1.0 1.0 49.0 Apartment
5588 1.0 1.0 35.0 Apartment
5593 1.0 1.0 50.0 Apartment
5596 1.0 1.0 47.0 Loft
5604 1.0 2.0 60.0 Apartment
5641 1.0 1.0 52.0 Apartment
5673 1.0 2.0 55.0 Apartment
5679 1.0 1.0 69.0 Apartment
5691 1.0 1.0 55.0 Apartment
5694 1.0 1.0 85.0 Apartment
5700 1.0 1.0 76.0 Apartment
5710 1.0 1.0 50.0 Apartment
5719 1.0 2.0 50.0 Apartment
5722 1.0 2.0 79.0 Apartment
5765 1.0 1.0 66.0 Apartment
5771 1.0 2.0 38.0 Loft
5796 1.0 1.0 32.0 Apartment
5798 1.0 2.0 45.0 Apartment
5799 1.0 1.0 60.0 Apartment
5800 1.0 1.0 90.0 Apartment
5811 1.0 1.0 60.0 Apartment
5822 1.0 1.0 50.0 Apartment
5828 1.0 2.0 50.0 Bed & Breakfast
5855 1.0 1.0 40.0 Apartment
5866 1.0 1.0 72.0 Apartment
5888 1.0 1.0 73.0 Apartment
5895 1.0 2.0 50.0 Apartment
5897 1.0 2.0 40.0 Apartment
5901 1.0 1.0 38.0 Other
5909 1.0 1.0 60.0 Apartment
5914 1.0 1.0 50.0 Apartment
5936 1.0 1.0 74.0 Apartment
5954 1.0 1.0 30.0 Apartment
6024 1.0 1.0 28.0 Apartment
6050 1.0 1.0 50.0 Condominium
6052 1.0 1.0 54.0 Loft
6069 1.0 1.0 44.0 Serviced apartment
6070 1.5 2.0 100.0 Apartment
6077 1.0 1.0 90.0 Apartment
6084 1.0 1.0 38.0 Apartment
6121 1.0 1.0 49.0 Apartment
6122 1.0 2.0 155.0 Apartment
6124 1.0 1.0 60.0 Apartment
6126 1.0 2.0 55.0 Condominium
6146 1.0 1.0 45.0 Apartment
6147 1.0 2.0 49.0 Apartment
6154 1.0 1.0 35.0 Apartment
6157 1.0 1.0 55.0 Apartment
6161 1.0 2.0 89.0 Apartment
6169 1.0 2.0 35.0 Apartment
6198 1.0 1.0 55.0 Apartment
6256 1.0 1.0 59.0 Apartment
6257 1.0 1.0 60.0 Apartment
6264 1.0 2.0 45.0 Loft
6278 1.0 1.0 55.0 Apartment
6281 1.0 1.0 35.0 Apartment
6294 2.0 1.0 40.0 Apartment
6299 1.0 1.0 50.0 Apartment
6307 1.0 1.0 34.0 Apartment
6309 1.0 1.0 70.0 Loft
6327 1.5 1.0 48.0 Loft
6350 1.0 1.0 40.0 Apartment
6352 1.0 2.0 42.0 Serviced apartment
6353 1.0 4.0 70.0 Apartment
6358 1.0 1.0 95.0 Apartment
6373 1.0 2.0 100.0 Apartment
6382 1.0 1.0 38.0 Apartment
6383 1.0 1.0 79.0 Apartment
6434 1.0 1.0 60.0 Apartment
6445 1.0 2.0 50.0 Apartment
6470 1.0 2.0 69.0 Apartment
6501 1.0 3.0 70.0 Apartment
6559 1.0 1.0 89.0 Apartment
6565 1.0 1.0 45.0 Apartment
6567 1.0 1.0 43.0 Other
6580 1.0 1.0 65.0 Apartment
6581 1.0 1.0 139.0 Apartment
6604 1.0 2.0 33.0 Apartment
6642 1.0 1.0 75.0 Apartment
6649 1.0 2.0 76.0 Apartment
6666 1.0 2.0 75.0 Apartment
6670 1.0 4.0 50.0 Apartment
6678 1.0 1.0 40.0 Apartment
6682 1.0 3.0 100.0 Apartment
6697 1.0 1.0 55.0 Apartment
6713 1.0 1.0 39.0 Apartment
6720 1.0 2.0 85.0 Loft
6753 1.0 1.0 39.0 Apartment
6754 1.0 1.0 39.0 Apartment
6771 1.0 1.0 50.0 House
6805 1.0 1.0 35.0 Apartment
6813 1.0 1.0 33.0 Apartment
6818 1.0 2.0 60.0 Apartment
6828 1.0 3.0 60.0 Apartment
6836 1.0 1.0 47.0 Apartment
6854 1.0 2.0 47.0 Apartment
6859 1.0 2.0 40.0 Apartment
6880 1.0 1.0 60.0 Apartment
6912 1.0 1.0 68.0 Loft
6916 1.0 2.0 55.0 Apartment
6917 1.0 1.0 78.0 Apartment
6950 1.0 2.0 45.0 Apartment
6951 1.0 2.0 42.0 Apartment
6957 1.0 3.0 127.0 Apartment
6966 1.0 1.0 80.0 Apartment
6993 1.0 4.0 65.0 Apartment
7044 1.0 2.0 113.0 Apartment
7063 1.0 2.0 40.0 Apartment
7076 1.0 3.0 130.0 Apartment
7077 1.0 1.0 60.0 Other
7110 2.0 1.0 90.0 Loft
7131 1.0 1.0 50.0 Apartment
7139 1.0 2.0 50.0 Apartment
7144 1.0 2.0 59.0 Apartment
7169 1.0 1.0 50.0 Apartment
7177 1.0 1.0 60.0 Apartment
7184 1.0 1.0 39.0 Apartment
7195 1.0 2.0 45.0 Apartment
7198 1.0 1.0 25.0 Apartment
7214 1.0 1.0 50.0 Apartment
7216 1.0 2.0 49.0 Apartment
7226 1.0 2.0 70.0 Apartment
7267 1.0 2.0 45.0 Apartment
7309 1.0 1.0 40.0 Apartment
7315 1.0 1.0 54.0 Apartment
7316 1.0 1.0 75.0 Apartment
7369 1.0 2.0 50.0 Apartment
7372 1.0 1.0 49.0 Apartment
7394 1.0 1.0 72.0 Apartment
7403 1.0 1.0 49.0 Apartment
7406 1.0 1.0 58.0 Apartment
7415 1.0 1.0 59.0 Apartment
7428 1.0 1.0 38.0 Apartment
7433 1.0 1.0 45.0 Apartment
7466 1.0 1.0 23.0 Apartment
7471 1.0 1.0 49.0 Apartment
7491 1.0 1.0 36.0 Loft
7509 1.0 2.0 37.0 Apartment
7512 1.0 3.0 55.0 Apartment
7525 1.0 2.0 100.0 Apartment
7545 1.0 1.0 40.0 Apartment
7550 1.0 1.0 55.0 Loft
7555 1.0 1.0 39.0 Apartment
7585 1.0 2.0 70.0 Apartment
7614 1.0 2.0 82.0 Apartment
7624 1.0 2.0 81.0 Apartment
7629 1.0 1.0 38.0 Apartment
7687 1.0 1.0 65.0 Apartment
7701 1.0 2.0 42.0 Apartment
7720 1.0 2.0 59.0 Apartment
7777 1.0 2.0 30.0 Apartment
7785 1.0 1.0 70.0 Apartment
7794 1.0 1.0 60.0 Apartment
7806 1.0 1.0 85.0 Apartment
7823 1.0 2.0 29.0 Apartment
7847 1.0 2.0 89.0 Apartment
7890 1.0 1.0 60.0 Loft
7932 1.0 1.0 68.0 Apartment
7935 1.0 1.0 95.0 Apartment
7943 1.0 2.0 35.0 Apartment
7956 1.0 2.0 69.0 Loft
7969 1.0 1.0 30.0 Condominium
7976 1.0 1.0 44.0 Apartment
7977 1.0 2.0 60.0 Loft
8020 1.0 2.0 45.0 Apartment
8021 1.0 1.0 80.0 Apartment
8045 1.0 2.0 65.0 Apartment
8046 1.0 1.0 48.0 Apartment
8095 1.0 1.0 82.0 Condominium
8107 1.0 1.0 55.0 Apartment
8110 1.0 2.0 54.0 Apartment
8114 1.0 1.0 59.0 Apartment
8135 1.0 2.0 55.0 Apartment
8136 1.0 1.0 45.0 Apartment
8146 1.0 1.0 50.0 Apartment
8165 1.0 1.0 65.0 Apartment
8166 1.0 1.0 45.0 Loft
8167 1.0 1.0 65.0 Apartment
8187 1.0 1.0 49.0 Apartment
8199 1.0 4.0 89.0 Apartment
8204 1.0 1.0 70.0 Apartment
8221 1.0 1.0 52.0 Condominium
8226 1.0 2.0 45.0 Apartment
8238 1.0 2.0 59.0 Loft
8314 1.0 1.0 55.0 Apartment
8326 1.0 2.0 45.0 Apartment
8333 1.0 1.0 35.0 Apartment
8356 1.0 2.0 85.0 Apartment
8400 1.0 2.0 80.0 Apartment
8402 1.0 1.0 99.0 Loft
8409 1.0 1.0 32.0 Serviced apartment
8418 1.0 1.0 52.0 Apartment
8420 1.0 1.0 44.0 Apartment
8431 1.0 2.0 65.0 Apartment
8442 1.0 1.0 45.0 Apartment
8451 1.0 1.0 70.0 Apartment
8468 1.0 1.0 55.0 Loft
8482 1.0 3.0 61.0 Apartment
8505 1.0 2.0 80.0 Apartment
8516 1.0 2.0 60.0 Apartment
8525 1.0 2.0 68.0 Apartment
8553 1.0 1.0 61.0 Apartment
8573 1.0 1.0 50.0 Apartment
8592 1.0 1.0 30.0 Apartment
8600 1.0 1.0 35.0 Apartment
8614 1.0 2.0 80.0 Loft
8617 1.0 1.0 50.0 Condominium
8621 1.0 2.0 55.0 Apartment
8627 1.0 1.0 55.0 Loft
8638 1.0 1.0 65.0 Apartment
8650 1.0 1.0 43.0 Apartment
8658 1.0 2.0 40.0 Apartment
8659 1.0 1.0 59.0 Apartment
8664 1.0 1.0 43.0 Apartment
8668 1.0 1.0 45.0 Apartment
8669 1.0 2.0 50.0 Apartment
8732 1.0 3.0 50.0 Condominium
8743 1.0 1.0 35.0 Apartment
8772 1.0 1.0 50.0 Apartment
8785 1.0 2.0 55.0 Apartment
8787 1.0 1.0 95.0 Apartment
8794 1.0 1.0 45.0 Apartment
8796 1.0 1.0 109.0 Apartment
8806 1.0 1.0 38.0 Apartment
8808 1.0 2.0 45.0 Apartment
8816 1.0 1.0 45.0 Apartment
8827 1.0 1.0 69.0 Loft
8880 1.0 1.0 45.0 Apartment
8923 1.0 1.0 47.0 Apartment
8929 1.0 1.0 49.0 Apartment
8944 1.0 1.0 55.0 Apartment
8946 1.0 2.0 70.0 Apartment
8961 1.0 1.0 55.0 Apartment
8968 1.0 1.0 70.0 Apartment
8977 1.0 1.0 51.0 Condominium
9004 1.0 1.0 45.0 Apartment
9020 1.0 1.0 65.0 Apartment
9022 1.0 1.0 70.0 Apartment
9029 1.0 1.0 90.0 Apartment
9043 1.0 5.0 50.0 Loft
9077 1.0 1.0 58.0 House
9081 1.0 2.0 35.0 Apartment
9083 1.0 2.0 60.0 Apartment
9091 1.0 2.0 40.0 Apartment
9100 1.0 1.0 75.0 Apartment
9126 1.0 1.0 30.0 Apartment
9187 1.0 1.0 40.0 Apartment
9198 1.0 3.0 70.0 Apartment
9201 1.0 2.0 51.0 Apartment
9221 1.0 1.0 55.0 Loft
9227 1.0 1.0 40.0 Apartment
9236 1.0 2.0 50.0 Apartment
9244 1.0 2.0 35.0 Apartment
9245 1.0 1.0 60.0 Apartment
9249 1.0 2.0 79.0 Apartment
9250 1.0 1.0 32.0 Apartment
9256 1.0 1.0 60.0 Loft
9303 1.0 1.0 50.0 Apartment
9332 1.0 1.0 39.0 Apartment
9336 1.0 1.0 95.0 Apartment
9340 1.0 2.0 65.0 Apartment
9366 1.0 1.0 70.0 Apartment
9372 1.0 3.0 90.0 Apartment
9377 1.0 1.0 70.0 Condominium
9391 1.0 1.0 55.0 Apartment
9395 1.0 1.0 48.0 Apartment
9414 1.0 1.0 65.0 Apartment
9422 1.0 2.0 49.0 Apartment
9433 1.0 1.0 49.0 Apartment
9458 1.0 1.0 60.0 Apartment
9472 1.0 1.0 55.0 Apartment
9478 1.0 2.0 40.0 Apartment
9498 1.0 2.0 79.0 Apartment
9518 1.0 1.0 55.0 Apartment
9523 1.0 1.0 45.0 Apartment
9526 1.0 1.0 40.0 Apartment
9567 1.0 2.0 70.0 Apartment
9575 1.0 1.0 65.0 Loft
9579 1.0 1.0 35.0 Apartment
9596 1.0 1.0 89.0 Apartment
9597 1.0 1.0 54.0 Apartment
9601 1.0 2.0 64.0 Apartment
9606 1.0 2.0 39.0 Apartment
9612 1.0 2.0 80.0 Apartment
9627 1.0 2.0 50.0 Apartment
9628 1.0 1.0 30.0 Apartment
9643 1.0 1.0 52.0 Loft
9653 1.0 2.0 63.0 Apartment
9659 1.0 1.0 59.0 Apartment
9702 1.0 2.0 80.0 House
9729 1.0 1.0 50.0 Apartment
9733 1.0 2.0 45.0 Apartment
9744 1.0 1.0 70.0 Apartment
9754 1.0 3.0 89.0 Apartment
9758 1.0 1.0 90.0 House
9765 1.0 1.0 40.0 Apartment
9780 1.0 1.0 90.0 Apartment
9791 1.0 1.0 60.0 Apartment
9792 1.0 3.0 125.0 Apartment
9801 1.0 1.0 75.0 Loft
9824 1.0 2.0 65.0 Apartment
9843 1.0 1.0 97.0 Apartment
9866 1.0 1.0 50.0 Apartment
9888 1.0 1.0 39.0 Apartment
9896 1.0 1.0 69.0 Apartment
9925 1.0 1.0 43.0 Apartment
9929 1.0 3.0 32.0 Apartment
9957 1.0 1.0 48.0 Apartment
10015 1.0 1.0 80.0 Apartment
10034 1.0 1.0 40.0 Apartment
10047 1.0 1.0 37.0 Loft
10048 1.0 2.0 51.0 Apartment
10050 1.0 1.0 50.0 Apartment
10070 1.0 2.0 61.0 Apartment
10072 1.0 2.0 45.0 Apartment
10078 1.0 1.0 112.0 Apartment
10079 1.0 1.0 75.0 Apartment
10099 1.0 2.0 60.0 Loft
10104 1.0 1.0 49.0 Apartment
10114 1.0 1.0 55.0 Apartment
10135 1.0 2.0 65.0 Apartment
10149 1.0 3.0 55.0 Apartment
10152 1.0 1.0 55.0 Apartment
10153 1.0 1.0 45.0 Apartment
10164 1.0 1.0 77.0 Apartment
10170 1.0 3.0 69.0 Apartment
10172 1.0 1.0 49.0 Apartment
10178 1.0 1.0 65.0 Apartment
10208 1.0 1.0 100.0 Apartment
10210 1.0 1.0 45.0 Apartment
10221 1.0 2.0 125.0 Apartment
10231 1.0 1.0 95.0 Apartment
10248 1.0 1.0 65.0 Loft
10272 1.0 1.0 32.0 Serviced apartment
10284 1.0 2.0 70.0 Loft
10289 1.0 2.0 42.0 Apartment
10312 1.0 1.0 45.0 Apartment
10321 1.0 2.0 50.0 Loft
10322 1.0 1.0 48.0 Apartment
10327 1.0 4.0 80.0 Apartment
10332 1.0 1.0 71.0 Apartment
10372 1.0 1.0 45.0 Apartment
10378 1.0 1.0 89.0 Apartment
10407 1.0 2.0 70.0 Apartment
10408 1.0 1.0 70.0 Loft
10422 1.0 1.0 58.0 Apartment
10423 1.0 2.0 36.0 Apartment
10446 1.0 1.0 60.0 Apartment
10468 1.0 1.0 75.0 Apartment
10484 1.0 2.0 41.0 Apartment
10503 1.0 1.0 45.0 Apartment
10515 1.0 1.0 105.0 Apartment
10533 1.0 1.0 40.0 Apartment
10575 1.0 1.0 52.0 Apartment
10602 1.0 2.0 63.0 Apartment
Registros con 7 baños o más:
Bathrooms Beds Price Property Type
6674 8.0 16.0 45.0 Hostel
6735 8.0 16.0 29.0 Apartment
9259 7.0 7.0 65.0 Hostel
# Inspect listings with no bathrooms and apartments claiming 16 beds.
zero_bathroom_rows = data_train.loc[data_train['Bathrooms'] == 0,
                                    ['Bedrooms', 'Beds', 'Price', 'Property Type']]
print(f"Registros con 0 baños:\n{zero_bathroom_rows}")
sixteen_bed_mask = (data_train['Beds'] == 16) & (data_train['Property Type'] == 'Apartment')
sixteen_bed_rows = data_train.loc[sixteen_bed_mask,
                                  ['Price', 'Bedrooms', 'Bathrooms', 'Property Type']]
print(f"\nApartamentos con 16 camas:\n{sixteen_bed_rows}")
Registros con 0 baños:
Bedrooms Beds Price Property Type
30 1.0 1.0 75.0 Apartment
71 1.0 1.0 24.0 Bed & Breakfast
773 1.0 2.0 22.0 Apartment
1138 1.0 2.0 50.0 Hostel
1254 1.0 1.0 33.0 Bed & Breakfast
1592 1.0 1.0 20.0 Apartment
1675 1.0 2.0 24.0 Apartment
1699 1.0 8.0 21.0 Bed & Breakfast
1751 1.0 1.0 30.0 Apartment
1752 1.0 1.0 30.0 Apartment
1899 1.0 1.0 30.0 Apartment
1912 1.0 1.0 149.0 Dorm
2064 1.0 1.0 29.0 Apartment
2322 1.0 1.0 29.0 House
2405 1.0 1.0 26.0 Apartment
2914 1.0 1.0 29.0 House
3280 1.0 1.0 30.0 Dorm
3458 1.0 2.0 39.0 House
3513 1.0 3.0 53.0 Guesthouse
3538 1.0 1.0 27.0 Condominium
3874 1.0 1.0 29.0 House
4112 1.0 1.0 20.0 Apartment
4124 1.0 2.0 18.0 Bed & Breakfast
4507 1.0 2.0 39.0 House
4770 1.0 1.0 31.0 Apartment
4925 1.0 1.0 9.0 Other
5098 1.0 1.0 25.0 Apartment
5905 1.0 10.0 20.0 Bed & Breakfast
6006 1.0 1.0 35.0 Apartment
6729 1.0 3.0 18.0 Dorm
6918 1.0 2.0 39.0 House
7053 1.0 1.0 49.0 Apartment
7246 1.0 1.0 20.0 Apartment
7706 1.0 1.0 35.0 Apartment
7916 10.0 2.0 39.0 House
8060 1.0 2.0 39.0 House
8273 1.0 1.0 29.0 Apartment
8406 1.0 2.0 39.0 House
8629 1.0 1.0 43.0 Apartment
8677 1.0 1.0 20.0 Bed & Breakfast
8767 1.0 1.0 30.0 Bed & Breakfast
8780 1.0 2.0 25.0 Apartment
8797 1.0 1.0 35.0 Apartment
8851 1.0 2.0 35.0 Bed & Breakfast
8903 1.0 1.0 49.0 Apartment
8936 1.0 1.0 35.0 Apartment
9401 1.0 1.0 35.0 Apartment
9473 1.0 1.0 20.0 Bed & Breakfast
9625 1.0 1.0 35.0 Apartment
9812 1.0 1.0 50.0 Apartment
9870 1.0 4.0 24.0 Bed & Breakfast
10202 1.0 1.0 35.0 Apartment
10376 1.0 1.0 30.0 Apartment
Apartamentos con 16 camas:
Price Bedrooms Bathrooms Property Type
664 200.0 6.0 2.0 Apartment
2437 400.0 6.0 6.0 Apartment
3859 180.0 10.0 6.0 Apartment
5502 80.0 4.0 2.0 Apartment
5557 850.0 10.0 6.5 Apartment
5705 399.0 8.0 4.5 Apartment
6539 200.0 5.0 2.0 Apartment
6735 29.0 1.0 8.0 Apartment
7303 550.0 7.0 3.0 Apartment
8286 338.0 5.0 2.0 Apartment
Del análisis anterior podemos concluir lo siguiente:
Inconsistencia en los datos: apartamentos que tienen 10 dormitorios pero solo 1 y 2 camas respectivamente (ID 1782 y 7916), son inusuales. Normalmente, esperaríamos un número mayor de camas en propiedades con tantos dormitorios. Esto podría ser un error en los datos o una situación atípica donde los dormitorios no están destinados a ser utilizados con camas tradicionales (por ejemplo, pueden ser utilizados como oficinas o espacios de trabajo).
El apartamento con el ID 7916 también indica 0 baños, lo cual es muy improbable para una propiedad con 10 dormitorios y sugiere un error de ingreso de datos o una omisión. Cada casa se espera que tenga al menos un baño. Un error de entrada de datos podría ser la causa aquí, o una interpretación incorrecta de un baño compartido o externo que no se contó.
Establecer el límite de outliers: Quiero ser conservadora en la eliminación de outliers pero deseo descartar registros muy inconsistentes, por lo que decido establecer un umbral basado en el conocimiento del dominio en lugar del rango intercuartílico.
# Scatter plot: square feet vs nightly price, coloured by room type.
sns.set_theme(style="white")
sns.relplot(data=data_train, x='Square Feet', y='Price', hue='Room Type', alpha=0.8)
plt.title('Relación entre Pies Cuadrados y Precio')
plt.xlabel('Pies Cuadrados', fontsize=10)
plt.ylabel('Precio', fontsize=10)
plt.show()
Tenemos en cuenta en el análisis anterior que hay muchos datos faltantes en la columna 'Square Feet'
# Correlation matrix restricted to numeric columns (non-numeric dtypes excluded).
corr_matrix = data_train.select_dtypes(include=[np.number]).corr()
# Scatter matrix between price and the capacity-related features.
# NOTE(review): plot_scatter_matrix is a project helper defined elsewhere in the notebook.
attributes = ['Price','Accommodates','Bathrooms','Bedrooms','Beds']
plot_scatter_matrix(data_train, attributes)
# Filter: properties reporting more than 2500 square feet.
oversized_mask = data_train['Square Feet'] > 2500
properties_above_2500sqft = data_train[oversized_mask]
print("Propiedades con más de 2500 SquareFeet:")
print(properties_above_2500sqft[['Square Feet', 'Price', 'Room Type', 'Bathrooms', 'Bedrooms', 'Beds']])
# Filter: listings reporting exactly 0 square feet (likely missing-value placeholder).
zero_sqft_mask = data_train['Square Feet'] == 0
apartments_with_0sqft = data_train[zero_sqft_mask]
print("\nApartamentos con 0 SquareFeet:")
print(apartments_with_0sqft[['Square Feet', 'Price', 'Bathrooms', 'Bedrooms', 'Beds']].describe())
Propiedades con más de 2500 SquareFeet:
Square Feet Price Room Type Bathrooms Bedrooms Beds
1495 2691.0 35.0 Private room 1.0 1.0 1.0
7346 2691.0 25.0 Private room 2.5 1.0 3.0
8580 2691.0 20.0 Private room 2.0 1.0 7.0
Apartamentos con 0 SquareFeet:
Square Feet Price Bathrooms Bedrooms Beds
count 169.0 169.000000 165.000000 169.000000 169.000000
mean 0.0 65.147929 1.230303 1.260355 2.272189
std 0.0 39.809315 0.547992 0.734135 1.821628
min 0.0 15.000000 0.000000 0.000000 1.000000
25% 0.0 37.000000 1.000000 1.000000 1.000000
50% 0.0 60.000000 1.000000 1.000000 2.000000
75% 0.0 80.000000 1.000000 1.000000 3.000000
max 0.0 250.000000 4.000000 4.000000 16.000000
Los registros que se muestran indican propiedades con más de 2500 'Square Feet' (pies cuadrados) pero con precios muy bajos, y todas están listadas como habitaciones privadas ('Private room'). Esto podría parecer inusual por varias razones:
Un tamaño de 2691 pies cuadrados es significativo para una propiedad, y uno esperaría que este tipo de propiedad sea un apartamento completo, una casa, o posiblemente una suite de lujo, no solo una habitación privada. Esto podría sugerir un error en la categorización o en el ingreso de los datos de tamaño.
Para propiedades de este tamaño, los precios listados (35, 25, y 20) son extremadamente bajos, lo cual no parece coherente con el mercado inmobiliario para propiedades de tal tamaño, incluso si solo se está alquilando una habitación privada dentro de la propiedad.
Los detalles adicionales, como la cantidad de baños y camas, especialmente en el registro con 7 camas pero solo listado como una habitación privada, también parecen inusuales y podrían indicar un error en la entrada de datos o una interpretación incorrecta de cómo se deben listar las propiedades.
Respecto a los apartamentos con 0 pies cuadrados:
Todos los apartamentos en este subconjunto tienen 0 'Square Feet', lo que indica que o bien esta información no fue recopilada o ingresada correctamente, o representa un marcador de posición para valores desconocidos. Es altamente improbable que estos apartamentos literalmente no tengan área, lo que sugiere un problema de datos.
El precio promedio de estos apartamentos es de aproximadamente 65.15, con un rango que va desde 15 hasta 250. Esto muestra que, a pesar de la falta de información sobre el tamaño, los precios varían significativamente, lo cual es esperable en el mercado inmobiliario, pero difícil de justificar sin conocer el tamaño de la propiedad.
El mínimo en baños y dormitorios es 0, lo que podría ser correcto para ciertos tipos de alojamientos (como estudios), pero también merece una revisión detallada para asegurar que no se trate de errores de entrada.
# Heatmap of pairwise correlations among capacity/size features and price.
size_price_cols = ['Accommodates','Bathrooms','Bedrooms','Beds','Square Feet','Price']
correlation_matrix = data_train[size_price_cols].corr()
plt.figure(figsize=(5, 4))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.show()
Nuevas columnas que se plantean:
- BedsPerRoom (nº camas / nº habitaciones): calcula el ratio de camas por habitación
- BathsPerRoom (nº baños / nº habitaciones): podría ser un indicador de lujo y conveniencia (afectaría al precio)
- PricePerBed (precio / nº camas): da una idea del costo por persona
- BedBathRooms (nº dormitorios * nº baños): relación entre el número de dormitorios y de baños

Columnas que no se incluirán en el modelo:
Square Feet (muchos datos faltantes, 96,15% de los registros)Columnas de texto/categóricas que se convertirán a categorías numéricas:
Property Type (primero clasificar en 6 valores únicos: Apartment' 'Loft' 'House' 'Bed & Breakfast' ‘Condominium' 'Others' y después codificar)Room Type (target encoding)Bed Type (pasar a binaria 0 - 1 en función de si es Real Bed o no)Amenities (recuento del listado de Amenities)Columnas numéricas que se mantienen:
Accommodates Bathrooms, Bedrooms, Beds columns_to_include = ['Price', 'Weekly Price', 'Monthly Price', 'Security Deposit',
'Cleaning Fee', 'Guests Included', 'Extra People']
plot_histograms(data_train, columns_to_include[:3], bins=15, figsize=(20, 4))
plot_histograms(data_train, columns_to_include[3:], bins=15, figsize=(20, 4))
# Boxplot of nightly price to visualize spread and outliers.
sns.set_style("whitegrid")
plt.figure(figsize=(12, 1.4))
ax = sns.boxplot(x=data_train['Price'], color='cornflowerblue')
ax.set_title('Boxplot de Precios por Noche', fontsize='11')
ax.set_xlabel('Precio', fontsize='10')
sns.despine(trim=True)
plt.show()
Hay una cantidad significativa de outliers, lo que indica variabilidad en los precios. El valor de la mediana (línea dentro de la caja) será un mejor indicador de la 'tarifa típica' que la media debido a estos outliers.
# Boxplot of the extra-guest fee, then list the listings charging > 100.
sns.set_style("whitegrid")
plt.figure(figsize=(12, 1.4))
ax = sns.boxplot(x=data_train['Extra People'], color='cornflowerblue')
ax.set_title('Boxplot de Coste Adicional por Huésped Extra', fontsize='11')
ax.set_xlabel('Precio', fontsize='10')
sns.despine(trim=True)
plt.show()
inspect_cols = ['Extra People', 'Property Type', 'Price',
                'Bedrooms', 'Bathrooms', 'Beds', 'Neighbourhood Cleansed']
data_train[data_train['Extra People'] > 100][inspect_cols]
| Extra People | Property Type | Price | Bedrooms | Bathrooms | Beds | Neighbourhood Cleansed | |
|---|---|---|---|---|---|---|---|
| 2495 | 276 | Apartment | 55.0 | 1.0 | 1.5 | 1.0 | Trafalgar |
| 4997 | 250 | Chalet | NaN | 3.0 | 3.0 | 3.0 | Niño Jesús |
| 5014 | 150 | Apartment | 120.0 | 1.0 | 1.0 | 2.0 | Universidad |
| 5591 | 250 | House | NaN | 3.0 | 3.0 | 3.0 | Niño Jesús |
| 8646 | 274 | Apartment | 75.0 | 1.0 | 1.0 | 1.0 | Universidad |
| 10466 | 250 | Apartment | 500.0 | 1.0 | 1.0 | 3.0 | Embajadores |
# Scatter matrix and correlation heatmap for the pricing-related columns.
attributes = ['Price', 'Weekly Price', 'Monthly Price', 'Security Deposit',
              'Cleaning Fee', 'Guests Included', 'Extra People']
plot_scatter_matrix(data_train, attributes)

pricing_cols = ['Weekly Price','Monthly Price','Security Deposit','Guests Included',
                'Cleaning Fee','Extra People','Bedrooms','Bathrooms','Beds','Price']
correlation_matrix = data_train[pricing_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.show()
Columnas que no se incluirán en el modelo:
Weekly Price, Monthly Price, Security Deposit (muchos valores faltantes)Columnas numéricas que se mantienen:
Cleaning Fee (hay que imputar, tiene 4347 datos faltantes) Guests IncludedExtra People, decido mantenerla a pesar de tener una correlación baja con el precio (puedo incluirla en el modelo predictivo inicial y luego evaluar la importancia). Si los métodos de selección de características o la importancia de las variables en el modelo indican que 'Extra People' no es significativa, consideraré excluirla del modelo final.columns_to_include = ['Minimum Nights', 'Maximum Nights', 'Availability 30',
'Availability 60', 'Availability 90', 'Availability 365']
# Histograms of the stay-length/availability columns, then boxplots of the
# minimum and maximum nights distributions.
plot_histograms(data_train, columns_to_include[:3], bins=15, figsize=(20, 4))
plot_histograms(data_train, columns_to_include[3:], bins=15, figsize=(20, 4))
sns.set_style("whitegrid")
for column, chart_title in [('Minimum Nights', 'Distribución de Noches Mínimas'),
                            ('Maximum Nights', 'Distribución de Noches Máximas')]:
    plt.figure(figsize=(12, 1.4))
    sns.boxplot(x=data_train[column], color='cornflowerblue')
    plt.title(chart_title, fontsize='11')
    plt.xlabel('Noches', fontsize='10')
    sns.despine(trim=True)
    plt.show()
Utilizaré los valores de las columnas Maximum Night y Minimum Nights para crear una nueva columna que refleja la duración de la estancia (corta, media, larga, variable)
# Scatter matrix: availability windows vs. price (visual check for any relation).
attributes = ['Availability 30', 'Availability 60', 'Availability 90', 'Availability 365', 'Price']
plot_scatter_matrix(data_train, attributes)
Los gráficos no parecen mostrar una relación lineal clara entre la disponibilidad y el precio. Esto podría significar que la disponibilidad no afecta directamente el precio o que cualquier relación puede ser no lineal o influenciada por otros factores.
# Correlation heatmap: stay-length and availability columns against price.
stay_cols = ['Minimum Nights','Maximum Nights', 'Availability 30',
             'Availability 60','Availability 90','Availability 365','Price']
correlation_matrix = data_train[stay_cols].corr()
plt.figure(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.show()
Nuevas columnas:
Stay Duration (duración de la estancia en base a minimum nights y maximum nights)Columnas que no se incluirán en el modelo:
Availability 30, Availability 60, Availability 90, Availability 365 (corr baja)Minimum Nights, Maximum Nights (utilizamos 'Stay Duration')Calendar Updated (información menos relevante)columns_to_include = ['Number of Reviews','Review Scores Rating','Review Scores Accuracy',
'Review Scores Cleanliness','Review Scores Checkin','Review Scores Communication',
'Review Scores Location','Review Scores Value','Reviews per Month',
'Calculated host listings count']
# Histograms, scatter matrix, and correlation heatmap for the review columns.
plot_histograms(data_train, columns_to_include[:5], bins=15, figsize=(20, 4))
plot_histograms(data_train, columns_to_include[5:], bins=15, figsize=(20, 4))

review_cols = ['Number of Reviews','Review Scores Rating','Review Scores Accuracy',
               'Review Scores Cleanliness','Review Scores Checkin','Review Scores Communication',
               'Review Scores Location','Review Scores Value','Reviews per Month',
               'Calculated host listings count']
attributes = review_cols
plot_scatter_matrix(data_train, attributes, figsize=(20, 12))

# Same columns plus the target for the correlation heatmap.
columns_of_interest = review_cols + ['Price']
correlation_matrix = data_train[columns_of_interest].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.show()
Columnas que no se incluirán en el modelo:
Number of Reviews, Review Scores Rating,Review Scores Accuracy, Review Scores Cleanliness, Review Scores Checkin, Review Scores Communication, Review Scores Location, Review Scores Value, Reviews per Month, Calculated host listings count (corr baja)Columnas de texto/categóricas que se convertirán a categorías numéricas:
- Features (recuento total de features)
- Cancellation Policy (8 valores únicos)

Tras el análisis anterior sospecho que algunos de los alojamientos con valores atípicos en el número de dormitorios y de baños podrían representar una situación única y por tanto real, como un hostal o un tipo de alojamiento compartido. Decido analizar la siguiente situación, eliminando los registros que tienen valores elevados en esas características.
# Three candidate outlier filters, visualized as Bedrooms vs. Price.
# Fix: the subplot titles did not match the filters actually applied
# (e.g. "menos de 6 dormitorios y más de 4 camas" for a filter that removes
# >6 bedrooms with <4 beds; "menos de 7 baños" for Bathrooms <= 7).
sns.set_style("whitegrid")
# Drop listings reporting many bedrooms (>6) but very few beds (<4): inconsistent records.
data_train_no_outliers = data_train[~((data_train['Bedrooms'] > 6) & (data_train['Beds'] < 4))]
# Keep listings with at most 7 bathrooms.
data_train_no_outliers2 = data_train[data_train['Bathrooms'] <= 7]
# Keep listings with at most 8 bedrooms.
data_train_no_outliers3 = data_train[data_train['Bedrooms'] <= 8]
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))
# Plot 1: bedrooms vs. price after removing bedroom/bed-inconsistent rows.
axs[0].scatter(data_train_no_outliers['Bedrooms'], data_train_no_outliers['Price'], color='royalblue')
axs[0].set_xlabel('Dormitorios')
axs[0].set_ylabel('Precio')
axs[0].set_title('Sin alojamientos con más de 6 dormitorios y menos de 4 camas')
# Plot 2: bedrooms vs. price for listings with at most 7 bathrooms.
axs[1].scatter(data_train_no_outliers2['Bedrooms'], data_train_no_outliers2['Price'], color='crimson')
axs[1].set_xlabel('Dormitorios')
axs[1].set_ylabel('Precio')
axs[1].set_title('Alojamientos con 7 baños o menos')
# Plot 3: bedrooms vs. price for listings with at most 8 bedrooms.
axs[2].scatter(data_train_no_outliers3['Bedrooms'], data_train_no_outliers3['Price'], color='lightseagreen')
axs[2].set_xlabel('Dormitorios')
axs[2].set_ylabel('Precio')
axs[2].set_title('Alojamientos con 8 dormitorios o menos')
plt.tight_layout()
sns.despine(top=True, right=True, left=False, bottom=False)
# Same three filtered subsets, now visualized as Bathrooms vs. Price.
# Fix: subplot titles corrected to describe the filters actually applied.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))
# Plot 1: bathrooms vs. price after removing bedroom/bed-inconsistent rows.
axs[0].scatter(data_train_no_outliers['Bathrooms'], data_train_no_outliers['Price'], color='royalblue')
axs[0].set_xlabel('Baños')
axs[0].set_ylabel('Precio')
axs[0].set_title('Sin alojamientos con más de 6 dormitorios y menos de 4 camas')
# Plot 2: bathrooms vs. price for listings with at most 7 bathrooms.
axs[1].scatter(data_train_no_outliers2['Bathrooms'], data_train_no_outliers2['Price'], color='crimson')
axs[1].set_xlabel('Baños')
axs[1].set_ylabel('Precio')
axs[1].set_title('Alojamientos con 7 baños o menos')
# Plot 3: bathrooms vs. price for listings with at most 8 bedrooms.
axs[2].scatter(data_train_no_outliers3['Bathrooms'], data_train_no_outliers3['Price'], color='lightseagreen')
axs[2].set_xlabel('Baños')
axs[2].set_ylabel('Precio')
axs[2].set_title('Alojamientos con 8 dormitorios o menos')
plt.tight_layout()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
# Report how many rows each candidate filter would remove.
# Fix: the percentage format spec was '{...:2f}' (field width 2, default
# 6 decimals); the intended spec is '{...:.2f}' (2 decimal places).
def _report_filter(original, filtered):
    # One report per candidate filter: row counts, absolute and relative loss.
    removed = original.shape[0] - filtered.shape[0]
    print(f'Original: {original.shape[0]} // Modificado: {filtered.shape[0]}\nDiferencia: {removed}')
    print(f'Variación: {(removed / original.shape[0]) * 100:.2f}%')

_report_filter(data_train, data_train_no_outliers)
print()
_report_filter(data_train, data_train_no_outliers2)
print()
_report_filter(data_train, data_train_no_outliers3)
Original: 10609 // Modificado: 10607 Diferencia: 2 Variación: 0.018852% Original: 10609 // Modificado: 10569 Diferencia: 40 Variación: 0.377038% Original: 10609 // Modificado: 10590 Diferencia: 19 Variación: 0.179093%
La aplicación cuidadosa de los filtros excluye menos del 1% del conjunto de datos, eliminando únicamente las instancias más atípicas sin comprometer la representatividad global. Este enfoque selectivo y minucioso garantiza la retención de información crucial para el análisis, manteniendo la calidad y la fidelidad de los datos para modelado futuro.
#data_train.select_dtypes(include=[np.number]).corr()
Características con correlaciones más altas con Price:
# Correlations of every numeric feature with Price, keeping only |corr| > 0.4.
corr_matrix = data_train.select_dtypes(include=[np.number]).corr()
price_corr = corr_matrix["Price"]
# |corr| > 0.4 is equivalent to (corr > 0.4) | (corr < -0.4); NaNs are excluded either way.
strong_correlations = price_corr[price_corr.abs() > 0.4]
print(strong_correlations.sort_values(ascending=False))
Price 1.000000 Monthly Price 0.838666 Weekly Price 0.830605 Cleaning Fee 0.656235 Accommodates 0.572179 Bedrooms 0.514831 Beds 0.471162 Name: Price, dtype: float64
# Heatmap of absolute correlations between predictors (Price excluded),
# masking the upper triangle to avoid showing each pair twice.
data = data_train.select_dtypes(include=[np.number])
corr = np.abs(data.drop(['Price'], axis=1).corr())
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, mask=mask, vmin=0.0, vmax=1.0, center=0.5,
            linewidths=.1, cmap="YlGnBu", cbar_kws={"shrink": .8})
plt.show()
LatLongInteraction (multiplicacion de longitude y latitude)BedsPerRoom (nº camas / nº habitaciones) - Relación Dormitorios-CamasBathsPerRoom (nº baños / nº habitaciones) - Relación Baños-HabitacionesPricePerBed (precio / nº camas) - Precio por camaBedBathRooms (nº dormitorios nº baños) - Relación Dormitorios-Baños*Stay Duration (duración de la estancia en base a minimum nights y maximum nights)data_train['LatLongInteraction'] = data_train['Latitude'] * data_train['Longitude']
# Ratio helper: numerator/denominator, or 0 when the denominator is not
# positive (this also maps a NaN denominator to 0, as the original lambdas did).
def _ratio(numerator, denominator):
    return numerator / denominator if denominator > 0 else 0

data_train['BedsPerRoom'] = data_train.apply(lambda row: _ratio(row['Beds'], row['Bedrooms']), axis=1)
data_train['BathsPerRoom'] = data_train.apply(lambda row: _ratio(row['Bathrooms'], row['Bedrooms']), axis=1)
data_train['PricePerBed'] = data_train['Price'] / data_train['Beds']
data_train['BedBathRooms'] = data_train['Bedrooms'] * data_train['Bathrooms']
engineered = ['LatLongInteraction','BedsPerRoom','BathsPerRoom','PricePerBed','BedBathRooms']
print(data_train[engineered].describe())
LatLongInteraction BedsPerRoom BathsPerRoom PricePerBed \
count 10609.000000 10583.000000 10571.000000 10568.000000
mean -149.441245 1.356939 0.952190 37.987937
std 0.944412 0.932363 0.577699 28.433538
min -156.730652 0.000000 0.000000 1.100000
25% -149.860933 1.000000 0.666667 22.312500
50% -149.614689 1.000000 1.000000 30.000000
75% -149.284273 2.000000 1.000000 46.000000
max -142.562833 16.000000 8.000000 650.000000
BedBathRooms
count 10557.000000
mean 1.842853
std 2.583767
min 0.000000
25% 1.000000
50% 1.000000
75% 2.000000
max 65.000000
# Create an ordinal stay-duration class from the min/max night limits.
def classify_stay(min_nights, max_nights):
    """Classify a listing's stay-length profile.

    Codes: 1 = short, 2 = medium, 3 = long, 4 = very long, 0 = unclassifiable.

    Fix: the first version only detected short stays when min_nights == 1,
    so listings with e.g. min 3 / max 5 nights fell into the
    "unclassifiable" bucket (code 0, 245 rows in the training set). Any
    stay whose bounds both fit within a week is now classed as short.
    """
    if min_nights <= 7 and max_nights <= 7:
        return 1  # Short stays (both bounds within a week)
    elif 7 < min_nights <= 30 or (min_nights <= 7 and 7 < max_nights <= 30):
        return 2  # Medium stays
    elif 30 < min_nights <= 365 or (min_nights <= 30 and 30 < max_nights <= 365):
        return 3  # Long stays
    elif min_nights > 365 or max_nights > 365:
        return 4  # Very long stays
    else:
        return 0  # Unclassifiable (e.g. NaN inputs fail every comparison)
# Derive the ordinal 'Stay Duration' feature row by row and inspect it.
data_train['Stay Duration'] = data_train.apply(lambda row: classify_stay(row['Minimum Nights'], row['Maximum Nights']), axis=1)
print(data_train[['Stay Duration', 'Minimum Nights', 'Maximum Nights']].describe())
print(data_train[['Stay Duration']].value_counts())
#data_train[['Stay Duration', 'Minimum Nights', 'Maximum Nights']].head()
Stay Duration Minimum Nights Maximum Nights count 10609.000000 10609.000000 10609.000000 mean 3.435197 3.014045 980.265906 std 0.982103 12.840494 9908.895741 min 0.000000 1.000000 1.000000 25% 3.000000 1.000000 365.000000 50% 4.000000 2.000000 1125.000000 75% 4.000000 3.000000 1125.000000 max 4.000000 1125.000000 1000000.000000 Stay Duration 4 7421 2 1523 3 1147 1 273 0 245 Name: count, dtype: int64
Se excluirán del análisis los alojamientos con:
Surge la duda de si eliminar los alojamientos con 0 pies cuadrados (metros cuadrados) debido a la poca fiabilidad de los datos de esa columna. Si hubiera más datos o una relación más coherente, se habría aplicado este filtro.
# Remove the outlier patterns identified above, as a single boolean mask:
#  - more than 6 bedrooms combined with fewer than 4 beds (inconsistent),
#  - more than 8 bedrooms,
#  - more than 6 bathrooms, or a non-positive bathroom count.
def filter_data(data):
    inconsistent_rooms = (data['Bedrooms'] > 6) & (data['Beds'] < 4)
    keep = (
        ~inconsistent_rooms
        & (data['Bedrooms'] <= 8)
        & (data['Bathrooms'] <= 6)
        & (data['Bathrooms'] > 0)
    )
    return data[keep]
# Apply the outlier filter to the training set.
data_train = filter_data(data_train)
Información menos relevante
Requeriría procesamiento de texto
URLs de imágenes
Presenta muchos valores faltantes
Contenido duplicado o redundante
Baja correlación con precio
# Columns excluded from the model: identifiers/URLs, free-text fields,
# mostly-missing columns, low-correlation features, and raw location /
# stay-length columns already replaced by engineered features.
# Fix: the original list repeated 'Host About', 'Host Total Listings Count'
# and 'Geolocation'; each label is now listed exactly once.
columns_to_drop = [
    'ID', 'Listing Url', 'Scrape ID', 'Last Scraped', 'Calendar last Scraped',
    'License', 'Jurisdiction Names', 'Host URL', 'Host About', 'Name', 'Summary',
    'Space', 'Description', 'Notes', 'Neighborhood Overview', 'Transit', 'Access',
    'Interaction', 'House Rules', 'Thumbnail Url', 'Medium Url', 'Picture Url',
    'XL Picture Url', 'Host Thumbnail Url', 'Host Picture Url', 'Host Acceptance Rate',
    'Has Availability', 'Host Total Listings Count', 'Geolocation', 'Neighbourhood',
    'Host ID', 'Host Name', 'Host Location', 'Host Neighbourhood',
    'Experiences Offered', 'Street', 'City', 'State',
    'Country Code', 'Country', 'Market', 'Smart Location', 'Latitude', 'Longitude', 'Zipcode',
    'Host Response Time', 'Square Feet', 'Weekly Price', 'Monthly Price', 'Security Deposit',
    'Availability 30', 'Availability 60', 'Availability 90', 'Availability 365', 'Minimum Nights',
    'Maximum Nights', 'Calendar Updated', 'Number of Reviews',
    'Review Scores Accuracy', 'Review Scores Cleanliness', 'Review Scores Rating',
    'Review Scores Checkin', 'Review Scores Communication', 'Review Scores Location',
    'Review Scores Value', 'Reviews per Month', 'Calculated host listings count',
    'Host Response Rate', 'First Review', 'Last Review'
]
# Drop the excluded columns and confirm the remaining schema.
data_train = data_train.drop(columns_to_drop, axis=1)
data_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 10497 entries, 0 to 10608 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Host Since 10495 non-null object 1 Host Listings Count 10495 non-null float64 2 Host Verifications 10494 non-null object 3 Neighbourhood Cleansed 10497 non-null object 4 Neighbourhood Group Cleansed 10497 non-null object 5 Property Type 10497 non-null object 6 Room Type 10497 non-null object 7 Accommodates 10497 non-null int64 8 Bathrooms 10497 non-null float64 9 Bedrooms 10497 non-null float64 10 Beds 10490 non-null float64 11 Bed Type 10497 non-null object 12 Amenities 10434 non-null object 13 Price 10489 non-null float64 14 Cleaning Fee 6227 non-null float64 15 Guests Included 10497 non-null int64 16 Extra People 10497 non-null int64 17 Cancellation Policy 10497 non-null object 18 Features 10497 non-null object 19 LatLongInteraction 10497 non-null float64 20 BedsPerRoom 10490 non-null float64 21 BathsPerRoom 10497 non-null float64 22 PricePerBed 10482 non-null float64 23 BedBathRooms 10497 non-null float64 24 Stay Duration 10497 non-null int64 dtypes: float64(11), int64(4), object(10) memory usage: 2.1+ MB
# Distribution of cancellation policies (note the tiny super-strict groups).
data_train['Cancellation Policy'].value_counts()
Cancellation Policy strict 3980 flexible 3385 moderate 3126 super_strict_60 4 super_strict_30 2 Name: count, dtype: int64
# Host tenure: days elapsed from 'Host Since' to today.
# NOTE(review): this uses the current date, so the feature changes between
# runs — consider a fixed reference date for reproducibility.
data_train['Host Since'] = pd.to_datetime(data_train['Host Since'])
data_train['Host Since'] = (pd.Timestamp.now().normalize() - data_train['Host Since']).dt.days
# Total number of host verifications (0 when the field is missing).
data_train['Host Verifications'] = data_train['Host Verifications'].apply(lambda x: len(x.split(',')) if pd.notna(x) else 0)
# Binary flag: 1 if 'Bed Type' is 'Real Bed', 0 otherwise.
data_train['Bed Type'] = (data_train['Bed Type'] == 'Real Bed').astype(int)
# Amenity count; rows with missing 'Amenities' become NaN on index alignment.
data_train['Amenities'] = data_train['Amenities'].dropna().str.split(',').apply(len)
# Feature count per listing, same pattern as 'Amenities'.
data_train['Features'] = data_train['Features'].dropna().str.split(',').apply(len)
# Raw property-type distribution (long tail of rare categories).
data_train['Property Type'].value_counts()
Property Type Apartment 8679 House 756 Condominium 279 Bed & Breakfast 259 Loft 223 Other 174 Guesthouse 29 Dorm 29 Chalet 21 Serviced apartment 12 Hostel 8 Townhouse 8 Villa 4 Boutique hotel 4 Casa particular 3 Guest suite 3 Earth House 3 Camper/RV 2 Tent 1 Name: count, dtype: int64
# Collapse the long tail of 'Property Type' into an 'Others' bucket,
# keeping only the five most frequent categories.
def group_property_type(dataframe):
    frequent = {'Apartment', 'House', 'Condominium', 'Bed & Breakfast', 'Loft'}

    def bucket(value):
        return value if value in frequent else 'Others'

    return dataframe['Property Type'].apply(bucket)
# Apply the grouping and verify the new, compact distribution.
data_train['Property Type'] = group_property_type(data_train)
data_train['Property Type'].value_counts()
Property Type Apartment 8679 House 756 Others 301 Condominium 279 Bed & Breakfast 259 Loft 223 Name: count, dtype: int64
# Mean (target) encoding: replace each category with the average price of
# its group; keep the learned mappings so they can be reused on the test set.
categorical = ['Neighbourhood Cleansed', 'Neighbourhood Group Cleansed', 'Property Type', 'Room Type']
mean_map = {}
for column in categorical:
    category_means = data_train.groupby(column)['Price'].mean()
    data_train[column] = data_train[column].map(category_means)
    mean_map[column] = category_means
En las políticas de cancelación decidimos usar un codificador ordinal ya que tienen un orden inherente (de menos a más estrictas). La jerarquía sería flexible < moderate < strict < super_strict_30 < super_strict_60 (asumiendo que super_strict_60 es más estricta que super_strict_30). Podrían unirse las categorías super_strict_60 y super_strict_30 en un solo grupo (debido a su baja frecuencia).
# Merge the two rare super-strict policies into a single 'super_strict' level.
super_strict_map = {'super_strict_60': 'super_strict', 'super_strict_30': 'super_strict'}
data_train['Cancellation Policy'] = data_train['Cancellation Policy'].replace(super_strict_map)
print(data_train['Cancellation Policy'].value_counts())
Cancellation Policy strict 3980 flexible 3385 moderate 3126 super_strict 6 Name: count, dtype: int64
# Ordinal-encode the cancellation policy with an explicit order:
# flexible < moderate < strict < super_strict.
from sklearn.preprocessing import OrdinalEncoder

policy_order = [['flexible', 'moderate', 'strict', 'super_strict']]
cancel_cat = data_train[['Cancellation Policy']]
ordinal_encoder = OrdinalEncoder(categories=policy_order)
encoded_data = ordinal_encoder.fit_transform(cancel_cat)
print(encoded_data)
# To recover the original labels if ever needed:
# decoded_data = ordinal_encoder.inverse_transform(encoded_data)
# print(decoded_data)
[[1.] [1.] [1.] ... [1.] [1.] [3.]]
# Overwrite the column with its ordinal codes and check the resulting dtype.
data_train['Cancellation Policy'] = encoded_data
print(data_train['Cancellation Policy'].dtype)
float64
# Verify that every remaining column is numeric.
data_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 10497 entries, 0 to 10608 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Host Since 10495 non-null float64 1 Host Listings Count 10495 non-null float64 2 Host Verifications 10497 non-null int64 3 Neighbourhood Cleansed 10497 non-null float64 4 Neighbourhood Group Cleansed 10497 non-null float64 5 Property Type 10497 non-null float64 6 Room Type 10497 non-null float64 7 Accommodates 10497 non-null int64 8 Bathrooms 10497 non-null float64 9 Bedrooms 10497 non-null float64 10 Beds 10490 non-null float64 11 Bed Type 10497 non-null int64 12 Amenities 10434 non-null float64 13 Price 10489 non-null float64 14 Cleaning Fee 6227 non-null float64 15 Guests Included 10497 non-null int64 16 Extra People 10497 non-null int64 17 Cancellation Policy 10497 non-null float64 18 Features 10497 non-null int64 19 LatLongInteraction 10497 non-null float64 20 BedsPerRoom 10490 non-null float64 21 BathsPerRoom 10497 non-null float64 22 PricePerBed 10482 non-null float64 23 BedBathRooms 10497 non-null float64 24 Stay Duration 10497 non-null int64 dtypes: float64(18), int64(7) memory usage: 2.1 MB
La mayoría de los algoritmos de machine learning no pueden funcionar si faltan características, así que necesitamos ocuparnos de ello.
# Impute missing values.
# Note: KNNImputer fills each missing entry with the mean of that feature
# over its k nearest neighbours (k=5 by default) — it does NOT use the
# column median (that would be SimpleImputer(strategy='median')).
from sklearn.impute import KNNImputer
imputer = KNNImputer()
# NOTE(review): 'Price' is the target and 'PricePerBed' is derived from it;
# imputing them alongside the features may leak target information — confirm intent.
columns_to_impute = ['Host Since','Host Listings Count','Beds','Amenities',
'Cleaning Fee','BedsPerRoom','PricePerBed', 'Price']
# Fit on the training data and fill the missing values.
X_imputed = imputer.fit_transform(data_train[columns_to_impute])
# Back to a DataFrame so index/column structure is preserved for update().
data_train_imputed = pd.DataFrame(X_imputed, columns=columns_to_impute,index=data_train.index)
# Write the imputed values back into the original columns of data_train.
data_train.update(data_train_imputed)
# Verify the imputation left no missing values.
print(data_train[columns_to_impute].isnull().sum())
Host Since 0 Host Listings Count 0 Beds 0 Amenities 0 Cleaning Fee 0 BedsPerRoom 0 PricePerBed 0 Price 0 dtype: int64
# Second round of feature pruning (based on Lasso results).
columns_to_drop_2 = ['Host Verifications', 'Stay Duration']
data_train = data_train.drop(columns_to_drop_2, axis=1)
# Third round of pruning (after the follow-up analysis).
columns_to_drop_3 = ['Features','Bed Type','Cancellation Policy','Extra People','Neighbourhood Group Cleansed','Property Type']
data_train = data_train.drop(columns_to_drop_3, axis=1)
# Verify there is no missing data left.
data_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 10497 entries, 0 to 10608 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Host Since 10497 non-null float64 1 Host Listings Count 10497 non-null float64 2 Neighbourhood Cleansed 10497 non-null float64 3 Room Type 10497 non-null float64 4 Accommodates 10497 non-null int64 5 Bathrooms 10497 non-null float64 6 Bedrooms 10497 non-null float64 7 Beds 10497 non-null float64 8 Amenities 10497 non-null float64 9 Price 10497 non-null float64 10 Cleaning Fee 10497 non-null float64 11 Guests Included 10497 non-null int64 12 LatLongInteraction 10497 non-null float64 13 BedsPerRoom 10497 non-null float64 14 BathsPerRoom 10497 non-null float64 15 PricePerBed 10497 non-null float64 16 BedBathRooms 10497 non-null float64 dtypes: float64(15), int64(2) memory usage: 1.4 MB
# Re-check correlations after pruning: predictor-vs-predictor heatmap with
# the upper triangle masked, then each feature's correlation with Price.
data = data_train.select_dtypes(include=[np.number])
corr = np.abs(data.drop(['Price'], axis=1).corr())
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, mask=mask, vmin=0.0, vmax=1.0, center=0.5,
            linewidths=.1, cmap="YlGnBu", cbar_kws={"shrink": .8})
plt.show()

corr_matrix = data_train.select_dtypes(include=[np.number]).corr()
print(corr_matrix["Price"].sort_values(ascending=False))
Price 1.000000 Cleaning Fee 0.689744 Accommodates 0.572874 BedBathRooms 0.567828 PricePerBed 0.560153 Bedrooms 0.519410 Beds 0.474187 Room Type 0.469757 Guests Included 0.374546 Neighbourhood Cleansed 0.354200 Bathrooms 0.347137 Amenities 0.213631 Host Listings Count 0.191909 BedsPerRoom 0.087661 Host Since 0.080965 LatLongInteraction -0.044289 BathsPerRoom -0.162766 Name: Price, dtype: float64
# Restore default pandas display limits and review the final schema.
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
data_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 10497 entries, 0 to 10608 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Host Since 10497 non-null float64 1 Host Listings Count 10497 non-null float64 2 Neighbourhood Cleansed 10497 non-null float64 3 Room Type 10497 non-null float64 4 Accommodates 10497 non-null int64 5 Bathrooms 10497 non-null float64 6 Bedrooms 10497 non-null float64 7 Beds 10497 non-null float64 8 Amenities 10497 non-null float64 9 Price 10497 non-null float64 10 Cleaning Fee 10497 non-null float64 11 Guests Included 10497 non-null int64 12 LatLongInteraction 10497 non-null float64 13 BedsPerRoom 10497 non-null float64 14 BathsPerRoom 10497 non-null float64 15 PricePerBed 10497 non-null float64 16 BedBathRooms 10497 non-null float64 dtypes: float64(15), int64(2) memory usage: 1.4 MB
Dividir en x y en Y:
# Locate the positional index of the target column 'Price'.
price_index = data_train.columns.get_loc('Price')
price_index
9
from sklearn import preprocessing
# Split the training frame into target vector (y) and feature matrix (X).
data = data_train.values
y_train = data[:, price_index]  # extract the 'Price' column by its numeric index
X_train = np.delete(data, price_index, axis=1)  # drop the target column from the matrix
feature_names = np.delete(data_train.columns.values, price_index)
# Standardize features with statistics computed on the TRAINING data only;
# the fitted scaler is reused later to transform the test set.
scaler = preprocessing.StandardScaler().fit(X_train)
XtrainScaled = scaler.transform(X_train)
feature_names
array(['Host Since', 'Host Listings Count', 'Neighbourhood Cleansed',
'Room Type', 'Accommodates', 'Bathrooms', 'Bedrooms', 'Beds',
'Amenities', 'Cleaning Fee', 'Guests Included',
'LatLongInteraction', 'BedsPerRoom', 'BathsPerRoom', 'PricePerBed',
'BedBathRooms'], dtype=object)
# Load the held-out test split and replay the SAME preprocessing pipeline
# applied to the training data: feature engineering, outlier filtering,
# column pruning, encoding and imputation. Order matters throughout.
data_test = pd.read_csv('./test.csv', sep=';', decimal='.')
# Add the engineered features (same definitions as for the training set).
data_test['LatLongInteraction'] = data_test['Latitude'] * data_test['Longitude']
# Per-room ratios, guarded against division by zero when Bedrooms == 0.
data_test['BedsPerRoom'] = data_test.apply(lambda row: row['Beds'] / row['Bedrooms'] if row['Bedrooms'] > 0 else 0, axis=1)
data_test['BathsPerRoom'] = data_test.apply(lambda row: row['Bathrooms'] / row['Bedrooms'] if row['Bedrooms'] > 0 else 0, axis=1)
# NOTE(review): no zero guard here — Beds == 0 yields inf/NaN in PricePerBed;
# confirm the imputation step below is expected to handle those rows.
data_test['PricePerBed'] = data_test['Price'] / data_test['Beds']
data_test['BedBathRooms'] = data_test['Bedrooms'] * data_test['Bathrooms']
# classify_stay is defined earlier in the notebook (train-side preprocessing).
data_test['Stay Duration'] = data_test.apply(lambda row: classify_stay(row['Minimum Nights'], row['Maximum Nights']), axis=1)
# Filter outliers with the same helper used on the training set.
data_test = filter_data(data_test)
# Drop identifiers, URLs, free-text fields and other unused columns.
# NOTE(review): 'Host About', 'Host Total Listings Count' and 'Geolocation'
# appear twice in this list — harmless for df.drop, but worth de-duplicating.
columns_to_drop = [
    'ID', 'Listing Url', 'Scrape ID', 'Last Scraped', 'Calendar last Scraped',
    'License', 'Jurisdiction Names', 'Host URL', 'Host About', 'Name', 'Summary',
    'Space', 'Description', 'Notes', 'Neighborhood Overview', 'Transit', 'Access',
    'Interaction', 'House Rules', 'Thumbnail Url', 'Medium Url', 'Picture Url',
    'XL Picture Url', 'Host Thumbnail Url', 'Host Picture Url', 'Host Acceptance Rate',
    'Has Availability', 'Host Total Listings Count', 'Geolocation', 'Neighbourhood',
    'Host ID', 'Host Name', 'Host Location', 'Host Neighbourhood', 'Host About',
    'Host Total Listings Count', 'Experiences Offered', 'Street', 'City', 'State',
    'Country Code', 'Country', 'Market', 'Smart Location', 'Latitude', 'Longitude', 'Zipcode',
    'Host Response Time', 'Square Feet', 'Weekly Price', 'Monthly Price', 'Security Deposit',
    'Availability 30', 'Availability 60', 'Availability 90', 'Availability 365', 'Minimum Nights',
    'Maximum Nights', 'Calendar Updated', 'Geolocation', 'Number of Reviews',
    'Review Scores Accuracy', 'Review Scores Cleanliness', 'Review Scores Rating',
    'Review Scores Checkin', 'Review Scores Communication', 'Review Scores Location',
    'Review Scores Value', 'Reviews per Month', 'Calculated host listings count',
    'Host Response Rate', 'First Review', 'Last Review'
]
data_test = data_test.drop(columns_to_drop, axis=1)
# Variable encoding.
data_test['Host Since'] = pd.to_datetime(data_test['Host Since'])
# Convert the join date to "days since the host joined" relative to today.
data_test['Host Since'] = (pd.Timestamp.now().normalize() - data_test['Host Since']).dt.days
# Number of verification methods the host has (0 when missing).
data_test['Host Verifications'] = data_test['Host Verifications'].apply(lambda x: len(x.split(',')) if pd.notna(x) else 0)
# Binary flag: 1 for 'Real Bed', 0 for any other bed type.
data_test['Bed Type'] = (data_test['Bed Type'] == 'Real Bed').astype(int)
# Amenity/feature counts. NOTE(review): because of .dropna(), rows that were
# NaN stay NaN after the index-aligned assignment — presumably filled by the
# imputer below; verify these columns are in columns_to_impute.
data_test['Amenities'] = data_test['Amenities'].dropna().str.split(',').apply(len)
data_test['Features'] = data_test['Features'].dropna().str.split(',').apply(len)
# Mean (target) encoding using the mappings learned on the training set.
data_test['Property Type'] = group_property_type(data_test)
categorical = ['Neighbourhood Cleansed', 'Neighbourhood Group Cleansed', 'Property Type', 'Room Type']
for c in categorical:
    data_test[c] = data_test[c].map(mean_map[c])
# Ordinal encoding of the cancellation policy (ordered categories).
data_test['Cancellation Policy'] = data_test['Cancellation Policy'].replace(['super_strict_60', 'super_strict_30'], 'super_strict')
categories = [['flexible', 'moderate', 'strict', 'super_strict']]
ordinal_encoder = OrdinalEncoder(categories=categories)
data_test['Cancellation Policy'] = ordinal_encoder.fit_transform(data_test[['Cancellation Policy']])
# KNN imputation of the remaining missing values.
# NOTE(review): fit_transform re-fits the imputer ON THE TEST SET — for a
# leak-free pipeline this should probably be imputer.transform(...) with the
# imputer fitted on the training data; confirm intent.
X_imputed = imputer.fit_transform(data_test[columns_to_impute])
data_test_imputed = pd.DataFrame(X_imputed, columns=columns_to_impute,index=data_test.index)
data_test.update(data_test_imputed)
# Drop further columns discarded by the model (after Lasso).
columns_to_drop_2 = ['Host Verifications', 'Stay Duration']
data_test = data_test.drop(columns_to_drop_2, axis=1)
# Drop more columns (after the 2nd analysis).
columns_to_drop_3 = ['Features','Bed Type','Cancellation Policy','Extra People','Neighbourhood Group Cleansed','Property Type']
data_test = data_test.drop(columns_to_drop_3, axis=1)
# Verify there is no missing data and all features are numeric.
data_test.info()
<class 'pandas.core.frame.DataFrame'> Index: 2619 entries, 0 to 2652 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Host Since 2619 non-null float64 1 Host Listings Count 2619 non-null float64 2 Neighbourhood Cleansed 2619 non-null float64 3 Room Type 2619 non-null float64 4 Accommodates 2619 non-null int64 5 Bathrooms 2619 non-null float64 6 Bedrooms 2619 non-null float64 7 Beds 2619 non-null float64 8 Amenities 2619 non-null float64 9 Price 2619 non-null float64 10 Cleaning Fee 2619 non-null float64 11 Guests Included 2619 non-null int64 12 LatLongInteraction 2619 non-null float64 13 BedsPerRoom 2619 non-null float64 14 BathsPerRoom 2619 non-null float64 15 PricePerBed 2619 non-null float64 16 BedBathRooms 2619 non-null float64 dtypes: float64(15), int64(2) memory usage: 368.3 KB
# Extract the test data as numpy arrays (same column layout as training).
data_test_values = data_test.values
y_test = data_test_values[:, price_index]
X_test = np.delete(data_test_values, price_index, axis=1)
feature_names_test = np.delete(data_test.columns.values, price_index)
# This normalization/scaling uses the scaler fitted on the TRAINING data —
# never refit on the test set!
XtestScaled = scaler.transform(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_metrics(y_true, y_pred):
    """Return a dict with MAE, MSE, RMSE and R2, each rounded to 3 decimals.

    Parameters
    ----------
    y_true : array-like of true target values.
    y_pred : array-like of predicted values, same length as y_true.
    """
    # Compute MSE once and derive RMSE via np.sqrt: the `squared=False`
    # parameter of mean_squared_error was deprecated in scikit-learn 1.4
    # and removed in 1.6, so this form is version-proof.
    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        'MAE': round(mean_absolute_error(y_true, y_pred), 3),
        'MSE': round(mse, 3),
        'RMSE': round(np.sqrt(mse), 3),
        'R2': round(r2_score(y_true, y_pred), 3)
    }
    return metrics
from sklearn.linear_model import Lasso

# Lasso regularization path: fit over a grid of alphas and track how the
# coefficients (and their squared L2 norm) shrink as alpha grows.
n_alphas = 20
alphas = np.logspace(-10, 0, n_alphas)
coefs = []
norm2_coefs = []
for a in alphas:
    lasso = Lasso(alpha=a).fit(XtrainScaled, y_train)
    coefs.append(lasso.coef_)
    # coef_ is 1-D, so a plain dot product gives ||w||^2 (the original .T
    # was a no-op on a 1-D array).
    norm2_coefs.append(np.dot(lasso.coef_, lasso.coef_))

plt.figure(figsize=(14, 5))
ax = plt.subplot(1, 2, 1)
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis: strong -> weak regularization
plt.xlabel('alpha')
plt.ylabel('$w_i$')
plt.title('Coeficientes en función de la regularización')
plt.axis('tight')

ax = plt.subplot(1, 2, 2)
ax.plot(alphas, norm2_coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
# Raw string: '\m' is an invalid escape sequence in a normal string literal
# (SyntaxWarning on Python >= 3.12).
plt.ylabel(r'$||\mathbf{w}||^2_2$')
plt.title('Norma de los coeffs en función de la regularización')
plt.axis('tight')
plt.show()
La primera gráfica muestra cómo los coeficientes individuales del modelo varían con diferentes valores de alpha. En un modelo Lasso, a medida que alpha aumenta, más coeficientes se reducirán a cero. Los coeficientes que caen a cero indican que Lasso ha eliminado esas características del modelo, considerándolas no informativas o redundantes.
La segunda gráfica muestra cómo la norma L2 al cuadrado de los coeficientes ($||w||_2^2$) disminuye a medida que alpha aumenta, lo que indica que el modelo está siendo más regularizado y, por lo tanto, se vuelve más simple.
Estas gráficas sugieren que el modelo Lasso está haciendo lo que se espera: penaliza los coeficientes, empujándolos hacia cero y posiblemente eliminando características que no contribuyen significativamente a la predicción del modelo.
from sklearn.model_selection import GridSearchCV

# Cross-validated search for the best Lasso regularization strength.
alpha_vector = np.logspace(-10, 1, 25)
param_grid = {'alpha': alpha_vector}
grid = GridSearchCV(Lasso(),
                    scoring='neg_mean_squared_error',
                    param_grid=param_grid,
                    cv=5)
grid.fit(XtrainScaled, y_train)
print("best parameters: {}".format(grid.best_params_))

# sklearn maximizes negated MSE; flip the sign to plot plain MSE.
scores = -np.array(grid.cv_results_['mean_test_score'])
plt.semilogx(alpha_vector, scores, '-o')
plt.xlabel('alpha', fontsize=16)
plt.ylabel('5-Fold MSE')
plt.ylim(min(scores) - 1, max(scores) + 1)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
best parameters: {'alpha': 0.01778279410038923}
# Refit Lasso with the alpha selected by the grid search above — reading it
# from grid.best_params_ avoids keeping a stale hard-coded copy of the value.
lasso = Lasso(alpha=grid.best_params_['alpha'])
lasso.fit(XtrainScaled, y_train)
# Coefficients driven exactly to zero identify features Lasso discards.
lasso_coefs = lasso.coef_
features_to_remove = [feature for feature, coef in zip(feature_names, lasso_coefs) if coef == 0]
print("Características para eliminar:", features_to_remove)
Características para eliminar: []
# R^2 of the fitted Lasso model on train vs test.
lasso_train_r2 = lasso.score(XtrainScaled, y_train)
lasso_test_r2 = lasso.score(XtestScaled, y_test)
print("Train: ", lasso_train_r2)
print("Test: ", lasso_test_r2)
Train: 0.8154816001821197 Test: 0.8252591851752692
y_pred = lasso.predict(XtestScaled)

# Evaluation metrics on the test set.
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

# Predicted vs actual, with the ideal-prediction diagonal for reference.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, 'k--', lw=2)
plt.xlabel('Valores Reales')
plt.ylabel('Valores Predichos')
plt.title('Lasso Regression: Predicción vs Real')
plt.show()
MAE: 12.408 MSE: 505.83 RMSE: 22.491 R2: 0.825
MAE (Mean Absolute Error): Si los precios oscilan entre 0 y 600, un MAE de 12.41 podría considerarse razonablemente bajo.
MSE (Mean Squared Error): Similar al MAE pero penaliza más los errores grandes. Un MSE de 505.83 es bastante más alto que el MAE debido a esta penalización. Si la mayoría de los precios están cerca del límite inferior del rango (cerca de 0), un MSE de más de 500 podría ser muy alto.
RMSE (Root Mean Squared Error): Un RMSE de 22.49 indica que, en promedio, las predicciones del modelo se desvían aproximadamente 22.49 unidades del valor real, lo cual proporciona una mejor idea de la magnitud de los errores que el MSE debido a que está en las mismas unidades que los precios.
R2 (R-squared): El R2 es una medida de qué tan bien las predicciones del modelo se correlacionan con los valores reales. Un valor de 0.825 (o 82.5%) es bastante alto, lo que sugiere que este modelo explica alrededor del 82.5% de la variabilidad en el precio. Un valor alto de R2 no necesariamente significa que el modelo es bueno para hacer predicciones precisas en datos nuevos.
# ElasticNetCV - handles multicollinearity better than pure Lasso.
from sklearn.linear_model import ElasticNetCV

enet = ElasticNetCV(cv=5, random_state=0).fit(XtrainScaled, y_train)
y_pred_enet = enet.predict(XtestScaled)

# Metrics — BUG FIX: evaluate the ElasticNet predictions (y_pred_enet);
# the original passed y_pred, the stale Lasso predictions, which is why the
# printed metrics were byte-identical to Lasso's.
metrics = compute_metrics(y_test, y_pred_enet)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
MAE: 12.408 MSE: 505.83 RMSE: 22.491 R2: 0.825
# R^2 of the ElasticNet model on train vs test.
enet_train_r2 = enet.score(XtrainScaled, y_train)
enet_test_r2 = enet.score(XtestScaled, y_test)
print("Train: ", enet_train_r2)
print("Test: ", enet_test_r2)
Train: 0.8156313565387212 Test: 0.825079800905857
Dado que ElasticNet equilibra entre Lasso y Ridge, observamos puntuaciones de train/test prácticamente idénticas a las de Lasso. (Nótese, además, que las métricas impresas arriba se calcularon sobre `y_pred` —las predicciones de Lasso— en lugar de `y_pred_enet`, por lo que coinciden exactamente con las de Lasso.) La similitud podría explicarse porque la validación cruzada seleccionó un parámetro de mezcla sesgado hacia la penalización L1.
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline on the scaled training data.
lr = LinearRegression().fit(XtrainScaled, y_train)

# Predictions on the test set, plus the evaluation metrics.
y_pred = lr.predict(XtestScaled)
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

# Fitted model parameters.
print('Intercepto (w0):', lr.intercept_)
print('Coeficientes (w1, w2, ..., wn):', lr.coef_)

# Squared L2 norm of the weight vector (intercept excluded); coef_ is 1-D,
# so a plain dot product suffices.
norm_w2 = np.dot(lr.coef_, lr.coef_)
print(f'\n||w||_2^2 = {norm_w2:.2g}')
MAE: 12.415 MSE: 505.786 RMSE: 22.49 R2: 0.825 Intercepto (w0): 65.7782414023054 Coeficientes (w1, w2, ..., wn): [ -1.07012948 -0.31977824 2.93189686 1.32212255 2.40605079 9.20571351 5.26664076 7.53438872 0.61977756 13.39736857 2.54196563 0.56047219 5.18524856 -10.46423407 28.82881597 7.73956604] ||w||_2^2 = 1.4e+03
# R^2 of the linear model on train vs test.
lr_train_r2 = lr.score(XtrainScaled, y_train)
lr_test_r2 = lr.score(XtestScaled, y_test)
print("Train: ", lr_train_r2)
print("Test: ", lr_test_r2)
Train: 0.8154845311576571 Test: 0.8252743029936962
# Predicted vs actual for the linear model.
plt.scatter(y_test, y_pred)
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, 'k--', lw=4)
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.title('Linear Regression: Valores Reales vs. Predicciones')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()

# Residual plot: a structureless cloud around zero indicates a good fit.
residuos = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuos, alpha=0.5)
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='red', linestyles='--')
plt.xlabel('Valores Predichos')
plt.ylabel('Residuos')
plt.title('Gráfico de Residuos')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
La gráfica de residuos muestra una dispersión uniforme alrededor de la línea de cero, lo cual es un indicador positivo de predicciones consistentes. Sin embargo, hay una posible tendencia de aumento en la varianza de los residuos a medida que los valores predichos se incrementan, indicando heterocedasticidad en los errores del modelo. Esto sugiere que el modelo podría mejorar con transformaciones adecuadas en los datos o técnicas de modelado que ajusten los residuos de manera diferencial.
# Learning curve - plots the cross-validated error as a function of the
# training-set size.
from sklearn.model_selection import learning_curve

train_sizes, train_scores, valid_scores = learning_curve(
    LinearRegression(), XtrainScaled, y_train,
    train_sizes=np.linspace(0.01, 1.0, 40), cv=5,
    scoring="neg_root_mean_squared_error")
# Scores are negated RMSE; flip the sign to plot plain errors.
train_errors = -train_scores.mean(axis=1)
valid_errors = -valid_scores.mean(axis=1)

plt.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_errors, "b-", linewidth=2, label="valid")
plt.xlabel('Tamaño del conjunto de entrenamiento')
# BUG FIX: the scoring above is (negated) RMSE, so the y axis must be
# labeled RMSE, not MSE.
plt.ylabel('RMSE')
plt.title('Curva de aprendizaje')
plt.legend()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
Esta curva de aprendizaje nos permite determinar si agregar más datos mejorará el rendimiento del modelo. Si las curvas se estabilizan y la diferencia entre ellas no es significativa (lo que indicaría un buen equilibrio entre sesgo y varianza), agregar más datos puede no ser útil. En su lugar, podríamos pensar en aumentar la complejidad del modelo o mejorar la calidad de los datos actuales.
from sklearn.tree import DecisionTreeRegressor

# Tune the tree depth with 3-fold CV; the default scoring for a regressor
# in GridSearchCV is R^2.
maxDepth = range(1, 12)
param_grid = {'max_depth': maxDepth}
grid = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid=param_grid, cv=3, verbose=2)
grid.fit(XtrainScaled, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))

scores = np.array(grid.cv_results_['mean_test_score'])
plt.plot(maxDepth, scores, '-o')
plt.xlabel('max_depth', fontsize=16)
# BUG FIX: the grid uses cv=3 and the default R^2 scoring, so the original
# '10-Fold MSE' label was wrong on both counts.
plt.ylabel('3-Fold R2')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
Fitting 3 folds for each of 11 candidates, totalling 33 fits
[CV] END ........................................max_depth=1; total time= 0.0s
[CV] END ........................................max_depth=1; total time= 0.0s
[CV] END ........................................max_depth=1; total time= 0.0s
[CV] END ........................................max_depth=2; total time= 0.0s
[CV] END ........................................max_depth=2; total time= 0.0s
[CV] END ........................................max_depth=2; total time= 0.0s
[CV] END ........................................max_depth=3; total time= 0.0s
[CV] END ........................................max_depth=3; total time= 0.0s
[CV] END ........................................max_depth=3; total time= 0.0s
[CV] END ........................................max_depth=4; total time= 0.0s
[CV] END ........................................max_depth=4; total time= 0.0s
[CV] END ........................................max_depth=4; total time= 0.0s
[CV] END ........................................max_depth=5; total time= 0.0s
[CV] END ........................................max_depth=5; total time= 0.0s
[CV] END ........................................max_depth=5; total time= 0.0s
[CV] END ........................................max_depth=6; total time= 0.0s
[CV] END ........................................max_depth=6; total time= 0.0s
[CV] END ........................................max_depth=6; total time= 0.0s
[CV] END ........................................max_depth=7; total time= 0.0s
[CV] END ........................................max_depth=7; total time= 0.0s
[CV] END ........................................max_depth=7; total time= 0.0s
[CV] END ........................................max_depth=8; total time= 0.0s
[CV] END ........................................max_depth=8; total time= 0.0s
[CV] END ........................................max_depth=8; total time= 0.0s
[CV] END ........................................max_depth=9; total time= 0.0s
[CV] END ........................................max_depth=9; total time= 0.0s
[CV] END ........................................max_depth=9; total time= 0.0s
[CV] END .......................................max_depth=10; total time= 0.0s
[CV] END .......................................max_depth=10; total time= 0.0s
[CV] END .......................................max_depth=10; total time= 0.0s
[CV] END .......................................max_depth=11; total time= 0.1s
[CV] END .......................................max_depth=11; total time= 0.1s
[CV] END .......................................max_depth=11; total time= 0.1s
best mean cross-validation score: 0.855
best parameters: {'max_depth': 11}
from sklearn import tree
# Visualize a fitted tree.
# NOTE(review): the grid search above selected max_depth=11, but depth 10 is
# used here (presumably to keep the plot legible) — confirm this is intentional.
treeModel = DecisionTreeRegressor(max_depth=10).fit(XtrainScaled, y_train)
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(treeModel, feature_names=list(feature_names), filled=True)
# Feature importances, rescaled so the largest equals 1, sorted descending.
importances = treeModel.feature_importances_
importances = importances / np.max(importances)
indices = np.argsort(importances)[::-1]
sns.set_theme(style="white")
plt.figure(figsize=(8,5))
plt.barh(range(XtrainScaled.shape[1]),importances[indices])
plt.yticks(range(XtrainScaled.shape[1]),feature_names[indices])
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
# Decision tree evaluation at the depth selected by the grid search.
maxDepthOptimo = 11
# random_state=0 added for consistency with the grid-search estimator, so
# the refit model is reproducible.
treeModel = DecisionTreeRegressor(max_depth=maxDepthOptimo, random_state=0).fit(XtrainScaled, y_train)
print("Train: ", treeModel.score(XtrainScaled, y_train))
print("Test: ", treeModel.score(XtestScaled, y_test))
Train: 0.9971661476137205 Test: 0.9565759431334919
y_pred = treeModel.predict(XtestScaled)

# Evaluation metrics on the test set.
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

# Actual vs predicted with the perfect-prediction diagonal.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, 'k--', lw=4)
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.title('Tree Model: Valores Reales vs. Predicciones')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
MAE: 2.72 MSE: 110.83 RMSE: 10.528 R2: 0.962
# Learning curve for the fitted tree model (errors are negated RMSE scores).
train_sizes, train_scores, valid_scores = learning_curve(
    treeModel, XtrainScaled, y_train,
    train_sizes=np.linspace(0.01, 1.0, 40), cv=5,
    scoring="neg_root_mean_squared_error")
train_errors, valid_errors = -train_scores.mean(axis=1), -valid_scores.mean(axis=1)

plt.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_errors, "b-", linewidth=2, label="valid")
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('RMSE')
plt.title('Curva de aprendizaje')
plt.legend()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
Un MAE de 2.72 es bastante bajo (en promedio, este modelo se equivoca por aproximadamente 2.72 unidades en sus predicciones). Un MSE de 110.83 y un RMSE de 10.53 indican la calidad de los ajustes del modelo y la dispersión de los residuos. Dado que el RMSE es la raíz cuadrada del MSE, proporciona una medida de error en las mismas unidades que la variable objetivo. Un R2 de 0.962 es muy alto, lo que implica que este modelo explica aproximadamente el 96.2% de la variabilidad en los datos de prueba.
La diferencia entre las puntuaciones de entrenamiento (0.997) y prueba (0.957) sugiere un ligero sobreajuste, aunque la capacidad de generalización sigue siendo buena.
from sklearn.ensemble import RandomForestRegressor

# Tune the forest depth with 3-fold CV; the default scoring for a regressor
# in GridSearchCV is R^2.
maxDepth = range(1, 15)
tuned_parameters = {'max_depth': maxDepth}
grid = GridSearchCV(
    RandomForestRegressor(random_state=0, n_estimators=200, max_features='sqrt'),
    param_grid=tuned_parameters, cv=3, verbose=2)
grid.fit(XtrainScaled, y_train)

print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))

scores = np.array(grid.cv_results_['mean_test_score'])
plt.plot(maxDepth, scores, '-o')
plt.xlabel('max_depth')
# BUG FIX: this is a regression with cv=3 and R^2 scoring — the original
# '10-fold ACC' (accuracy) label did not match either the fold count or
# the metric.
plt.ylabel('3-Fold R2')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] END ........................................max_depth=1; total time= 0.3s
[CV] END ........................................max_depth=1; total time= 0.3s
[CV] END ........................................max_depth=1; total time= 0.3s
[CV] END ........................................max_depth=2; total time= 0.5s
[CV] END ........................................max_depth=2; total time= 0.5s
[CV] END ........................................max_depth=2; total time= 0.5s
[CV] END ........................................max_depth=3; total time= 0.6s
[CV] END ........................................max_depth=3; total time= 0.6s
[CV] END ........................................max_depth=3; total time= 0.6s
[CV] END ........................................max_depth=4; total time= 0.7s
[CV] END ........................................max_depth=4; total time= 0.7s
[CV] END ........................................max_depth=4; total time= 0.7s
[CV] END ........................................max_depth=5; total time= 0.9s
[CV] END ........................................max_depth=5; total time= 0.9s
[CV] END ........................................max_depth=5; total time= 1.0s
[CV] END ........................................max_depth=6; total time= 1.0s
[CV] END ........................................max_depth=6; total time= 1.1s
[CV] END ........................................max_depth=6; total time= 1.0s
[CV] END ........................................max_depth=7; total time= 1.2s
[CV] END ........................................max_depth=7; total time= 1.2s
[CV] END ........................................max_depth=7; total time= 1.2s
[CV] END ........................................max_depth=8; total time= 1.4s
[CV] END ........................................max_depth=8; total time= 1.3s
[CV] END ........................................max_depth=8; total time= 1.3s
[CV] END ........................................max_depth=9; total time= 1.4s
[CV] END ........................................max_depth=9; total time= 1.5s
[CV] END ........................................max_depth=9; total time= 1.4s
[CV] END .......................................max_depth=10; total time= 1.6s
[CV] END .......................................max_depth=10; total time= 1.6s
[CV] END .......................................max_depth=10; total time= 1.6s
[CV] END .......................................max_depth=11; total time= 1.8s
[CV] END .......................................max_depth=11; total time= 1.7s
[CV] END .......................................max_depth=11; total time= 1.7s
[CV] END .......................................max_depth=12; total time= 1.9s
[CV] END .......................................max_depth=12; total time= 1.9s
[CV] END .......................................max_depth=12; total time= 1.9s
[CV] END .......................................max_depth=13; total time= 2.0s
[CV] END .......................................max_depth=13; total time= 2.1s
[CV] END .......................................max_depth=13; total time= 2.0s
[CV] END .......................................max_depth=14; total time= 2.1s
[CV] END .......................................max_depth=14; total time= 2.2s
[CV] END .......................................max_depth=14; total time= 2.1s
best mean cross-validation score: 0.898
best parameters: {'max_depth': 14}
# Refit the forest at the depth selected by the grid search.
maxDepthOptimo = grid.best_params_['max_depth']
# random_state=0 added so the refit matches the grid-search estimator and
# the importances below are reproducible.
randomForest = RandomForestRegressor(max_depth=maxDepthOptimo, n_estimators=200, max_features='sqrt', random_state=0).fit(XtrainScaled, y_train)

# Feature importances rescaled so the maximum equals 1, plotted descending.
importances = randomForest.feature_importances_
importances = importances / np.max(importances)
indices = np.argsort(importances)[::-1]
sns.set_theme(style="white")
plt.figure(figsize=(8, 5))
plt.barh(range(XtrainScaled.shape[1]), importances[indices])
plt.yticks(range(XtrainScaled.shape[1]), feature_names[indices])
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()

# Raw (unscaled) importance per feature.
for score, name in zip(randomForest.feature_importances_, feature_names):
    print(round(score, 2), name)
# Random Forest evaluation (refit with the optimal depth).
maxDepthOptimo = grid.best_params_['max_depth']
# random_state=0 added for reproducibility, matching the grid-search estimator.
randomForest = RandomForestRegressor(max_depth=maxDepthOptimo, n_estimators=200, max_features='sqrt', random_state=0).fit(XtrainScaled, y_train)
print("Train: ", randomForest.score(XtrainScaled, y_train))
print("Test: ", randomForest.score(XtestScaled, y_test))
Train: 0.9831833465915698 Test: 0.9438753933446635
y_pred = randomForest.predict(XtestScaled)

# Evaluation metrics on the test set.
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

# Actual vs predicted with the perfect-prediction diagonal.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, 'k--', lw=4)
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.title('Valores Reales vs. Predicciones')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
MAE: 5.818 MSE: 162.466 RMSE: 12.746 R2: 0.944
# Learning curve for the fitted random forest (errors are negated RMSE scores).
train_sizes, train_scores, valid_scores = learning_curve(
    randomForest, XtrainScaled, y_train,
    train_sizes=np.linspace(0.01, 1.0, 40), cv=5,
    scoring="neg_root_mean_squared_error")
train_errors, valid_errors = -train_scores.mean(axis=1), -valid_scores.mean(axis=1)

plt.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_errors, "b-", linewidth=2, label="valid")
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('RMSE')
plt.title('Curva de aprendizaje')
plt.legend()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
A medida que se añaden más datos, tanto el error de entrenamiento como el de validación disminuyen. El hecho de que el error de validación disminuya y se estabilice a medida que se usan más datos de entrenamiento es una buena señal. La diferencia entre las curvas de entrenamiento y validación parece razonable, lo que indica que el modelo generaliza bien. Sin embargo, un error de validación consistentemente más alto que el error de entrenamiento puede sugerir un sobreajuste leve.
from sklearn.ensemble import BaggingRegressor

# Tune the depth of the trees inside a 200-tree bagging ensemble (3-fold CV).
# NOTE(review): the 'base_estimator__*' parameter prefix is deprecated in
# recent scikit-learn in favour of 'estimator__*' (hence the FutureWarnings
# in the output); the old key is kept here because the best_params_ lookup
# further down reads 'base_estimator__max_depth'.
maxDepth = range(1, 15)
tuned_parameters = {'base_estimator__max_depth': maxDepth}
grid = GridSearchCV(
    BaggingRegressor(estimator=DecisionTreeRegressor(), random_state=0, n_estimators=200),
    param_grid=tuned_parameters, cv=3, verbose=2)
grid.fit(XtrainScaled, y_train)
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
# Mean CV score (R^2, the regressor default) per candidate depth.
scores = np.array(grid.cv_results_['mean_test_score'])
plt.plot(maxDepth, scores, '-o')
plt.xlabel('max_depth')
# Fixed label: the previous '10-fold ACC' was wrong — cv=3 and the default
# scoring for a regressor is R^2, not accuracy.
plt.ylabel('3-fold CV R2')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
Fitting 3 folds for each of 14 candidates, totalling 42 fits
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=1; total time= 0.8s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=1; total time= 0.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=1; total time= 0.8s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=2; total time= 1.3s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=2; total time= 1.3s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=2; total time= 1.3s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=3; total time= 1.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=3; total time= 1.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=3; total time= 1.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=4; total time= 2.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=4; total time= 2.5s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=4; total time= 2.6s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=5; total time= 3.1s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=5; total time= 2.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=5; total time= 2.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=6; total time= 3.5s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=6; total time= 3.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=6; total time= 3.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=7; total time= 3.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=7; total time= 4.0s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=7; total time= 3.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=8; total time= 4.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=8; total time= 4.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=8; total time= 4.7s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=9; total time= 5.0s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=9; total time= 4.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END ........................base_estimator__max_depth=9; total time= 5.2s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=10; total time= 5.5s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=10; total time= 5.5s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=10; total time= 5.5s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=11; total time= 5.7s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=11; total time= 5.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=11; total time= 5.7s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=12; total time= 6.1s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=12; total time= 6.1s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=12; total time= 5.9s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=13; total time= 6.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=13; total time= 6.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=13; total time= 6.3s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=14; total time= 6.4s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=14; total time= 6.5s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py:720: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. estimator = estimator.set_params(**cloned_parameters)
[CV] END .......................base_estimator__max_depth=14; total time= 6.2s
/Users/rociobenitezgarcia/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_search.py:929: FutureWarning: Parameter 'base_estimator' of BaggingRegressor is deprecated in favor of 'estimator'. See BaggingRegressor's docstring for more details. clone(base_estimator).set_params(**self.best_params_)
best mean cross-validation score: 0.931
best parameters: {'base_estimator__max_depth': 14}
# Evaluate the Bagging Regressor at the depth chosen by the grid search.
# (Removed the dead commented-out override 'maxDepthOptimo = 9'.)
maxDepthOptimo = grid.best_params_['base_estimator__max_depth']
baggingModel = BaggingRegressor(
    estimator=DecisionTreeRegressor(max_depth=maxDepthOptimo),
    n_estimators=200).fit(XtrainScaled, y_train)
# R^2 on train vs test.
print("Train: ", baggingModel.score(XtrainScaled, y_train))
print("Test: ", baggingModel.score(XtestScaled, y_test))
Train: 0.992663296332992 Test: 0.9752735476497003
# Average the per-tree feature importances over the whole ensemble and
# normalise so the strongest feature equals 1.
importances = np.mean([tree.feature_importances_ for tree in baggingModel.estimators_], axis=0)
importances = importances / np.max(importances)
indices = np.argsort(importances)[::-1]
# Horizontal bar chart, most important features first.
sns.set_theme(style="white")
plt.figure(figsize=(8, 5))
positions = range(XtrainScaled.shape[1])
plt.barh(positions, importances[indices])
plt.yticks(positions, feature_names[indices])
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
# Test-set predictions for the bagging model.
y_pred = baggingModel.predict(XtestScaled)
# Report the shared evaluation metrics.
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
# Actual vs predicted; dashed diagonal = perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'k--', lw=4)
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.title(f'Valores Reales vs. Predicciones')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
MAE: 1.572 MSE: 71.577 RMSE: 8.46 R2: 0.975
# Learning curve for the bagging model (RMSE vs training-set size, 5-fold CV).
train_sizes, train_scores, valid_scores = learning_curve(
    baggingModel, XtrainScaled, y_train,
    train_sizes=np.linspace(0.01, 1.0, 40), cv=5,
    scoring="neg_root_mean_squared_error")
# learning_curve returns negated RMSE, so negate the fold means back.
train_errors = -train_scores.mean(axis=1)
valid_errors = -valid_scores.mean(axis=1)
for errors, fmt, label in ((train_errors, "r-+", "train"),
                           (valid_errors, "b-", "valid")):
    plt.plot(train_sizes, errors, fmt, linewidth=2, label=label)
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('RMSE')
plt.title('Curva de aprendizaje')
plt.legend()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
Un R2 de 0.97 indica que el modelo puede explicar el 97% de la variabilidad en los datos de prueba, lo cual es excelente. Los valores bajos de MAE y RMSE indican que los errores de predicción son pequeños en promedio. La diferencia entre las métricas de entrenamiento y prueba no es muy grande, lo que sugiere que no hay un sobreajuste significativo.
from sklearn.ensemble import GradientBoostingRegressor

# Grid-search the number of boosting iterations and the learning rate;
# max_depth is held fixed at 3.
Niterations = [500, 1000, 1500, 2000]
learningRate = [0.1, 0.05]
param_grid = {'n_estimators': Niterations, 'learning_rate': learningRate}
grid = GridSearchCV(
    GradientBoostingRegressor(random_state=0, max_depth=3),
    param_grid=param_grid, cv=5, verbose=2)
# Fixed: search on the scaled training set like every other model here —
# the original fitted on the unscaled X_train while the final model below
# is trained on XtrainScaled, so the tuned params did not match the data.
grid.fit(XtrainScaled, y_train)
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ................learning_rate=0.1, n_estimators=500; total time= 8.0s
[CV] END ................learning_rate=0.1, n_estimators=500; total time= 8.2s
[CV] END ................learning_rate=0.1, n_estimators=500; total time= 8.2s
[CV] END ................learning_rate=0.1, n_estimators=500; total time= 8.1s
[CV] END ................learning_rate=0.1, n_estimators=500; total time= 8.1s
[CV] END ...............learning_rate=0.1, n_estimators=1000; total time= 15.9s
[CV] END ...............learning_rate=0.1, n_estimators=1000; total time= 16.0s
[CV] END ...............learning_rate=0.1, n_estimators=1000; total time= 15.8s
[CV] END ...............learning_rate=0.1, n_estimators=1000; total time= 15.8s
[CV] END ...............learning_rate=0.1, n_estimators=1000; total time= 16.3s
[CV] END ...............learning_rate=0.1, n_estimators=1500; total time= 24.1s
[CV] END ...............learning_rate=0.1, n_estimators=1500; total time= 24.3s
[CV] END ...............learning_rate=0.1, n_estimators=1500; total time= 24.0s
[CV] END ...............learning_rate=0.1, n_estimators=1500; total time= 23.8s
[CV] END ...............learning_rate=0.1, n_estimators=1500; total time= 24.1s
[CV] END ...............learning_rate=0.1, n_estimators=2000; total time= 31.7s
[CV] END ...............learning_rate=0.1, n_estimators=2000; total time= 31.6s
[CV] END ...............learning_rate=0.1, n_estimators=2000; total time= 32.1s
[CV] END ...............learning_rate=0.1, n_estimators=2000; total time= 31.5s
[CV] END ...............learning_rate=0.1, n_estimators=2000; total time= 31.8s
[CV] END ...............learning_rate=0.05, n_estimators=500; total time= 8.0s
[CV] END ...............learning_rate=0.05, n_estimators=500; total time= 7.9s
[CV] END ...............learning_rate=0.05, n_estimators=500; total time= 7.9s
[CV] END ...............learning_rate=0.05, n_estimators=500; total time= 8.2s
[CV] END ...............learning_rate=0.05, n_estimators=500; total time= 7.9s
[CV] END ..............learning_rate=0.05, n_estimators=1000; total time= 16.1s
[CV] END ..............learning_rate=0.05, n_estimators=1000; total time= 16.0s
[CV] END ..............learning_rate=0.05, n_estimators=1000; total time= 15.9s
[CV] END ..............learning_rate=0.05, n_estimators=1000; total time= 16.3s
[CV] END ..............learning_rate=0.05, n_estimators=1000; total time= 16.0s
[CV] END ..............learning_rate=0.05, n_estimators=1500; total time= 23.8s
[CV] END ..............learning_rate=0.05, n_estimators=1500; total time= 24.2s
[CV] END ..............learning_rate=0.05, n_estimators=1500; total time= 23.8s
[CV] END ..............learning_rate=0.05, n_estimators=1500; total time= 24.1s
[CV] END ..............learning_rate=0.05, n_estimators=1500; total time= 24.1s
[CV] END ..............learning_rate=0.05, n_estimators=2000; total time= 32.1s
[CV] END ..............learning_rate=0.05, n_estimators=2000; total time= 32.0s
[CV] END ..............learning_rate=0.05, n_estimators=2000; total time= 31.8s
[CV] END ..............learning_rate=0.05, n_estimators=2000; total time= 31.7s
[CV] END ..............learning_rate=0.05, n_estimators=2000; total time= 31.8s
best mean cross-validation score: 0.982
best parameters: {'learning_rate': 0.1, 'n_estimators': 2000}
# Final Gradient Boosting model with the grid-search optimum.
lrOptimo = grid.best_params_['learning_rate']
neOptimo = grid.best_params_['n_estimators']
bt = GradientBoostingRegressor(random_state=0, max_depth=3,
                               learning_rate=lrOptimo, n_estimators=neOptimo)
bt.fit(XtrainScaled, y_train)
# CV error (1 - R^2), one row per learning rate, one column per n_estimators.
error = 1 - grid.cv_results_['mean_test_score'].reshape(len(learningRate), len(Niterations))
colors = ['r', 'b', 'g', 'k', 'm']
for i, lr in enumerate(learningRate):
    plt.plot(Niterations, error[i, :], colors[i] + '--o', label='lr = %g' % lr)
plt.legend()
plt.xlabel('Iteraciones')
plt.ylabel('5-fold CV Error')
plt.title('train: %0.3f\ntest: %0.3f' % (bt.score(XtrainScaled, y_train),
                                         bt.score(XtestScaled, y_test)))
sns.despine(top=True, right=True, left=False, bottom=False)
plt.grid()
plt.show()
print("Train: ", bt.score(XtrainScaled, y_train))
print("Test: ", bt.score(XtestScaled, y_test))
Train: 0.9997283072534239 Test: 0.9947365442926436
# Test-set predictions for the boosting model.
y_pred = bt.predict(XtestScaled)
# Shared evaluation metrics.
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
# Actual vs predicted; dashed diagonal = perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
diagonal = [y_test.min(), y_test.max()]
plt.plot(diagonal, diagonal, 'k--', lw=4)
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.title(f'Valores Reales vs. Predicciones')
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
MAE: 1.165 MSE: 15.236 RMSE: 3.903 R2: 0.995
# Learning curve for the boosting model (RMSE vs training size, 5-fold CV).
train_sizes, train_scores, valid_scores = learning_curve(
    bt, XtrainScaled, y_train,
    train_sizes=np.linspace(0.01, 1.0, 40), cv=5,
    scoring="neg_root_mean_squared_error")
# Negate the neg-RMSE fold means to get errors.
train_errors = -train_scores.mean(axis=1)
valid_errors = -valid_scores.mean(axis=1)
for errors, fmt, label in ((train_errors, "r-+", "train"),
                           (valid_errors, "b-", "valid")):
    plt.plot(train_sizes, errors, fmt, linewidth=2, label=label)
plt.xlabel('Tamaño del conjunto de entrenamiento')
plt.ylabel('RMSE')
plt.title('Curva de aprendizaje')
plt.legend()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
El modelo Gradient Boosting Regressor muestra un excelente desempeño, con un R² de 0.995 en el conjunto de prueba, lo que indica una sólida capacidad para explicar la variabilidad de los datos y proporcionar predicciones precisas. El bajo MAE y RMSE confirman la precisión del modelo, mientras que una puntuación alta en la validación cruzada refleja su robustez y buena generalización.
Estos resultados, junto con la cuidadosa selección de hiperparámetros, sugieren que el modelo es altamente efectivo para la tarea de predicción en cuestión. Sin embargo, siempre es prudente realizar una validación adicional para asegurar la consistencia del modelo en diferentes conjuntos de datos.
from sklearn.svm import SVR

# Log-spaced grids for the RBF-SVR hyperparameters C and gamma.
vectorC = np.logspace(-2, 2, 10)
vectorG = np.logspace(-5, 1, 8)
param_grid = {'C': vectorC, 'gamma': vectorG}
grid = GridSearchCV(SVR(kernel='rbf'), param_grid=param_grid, cv=5, verbose=1)
grid.fit(XtrainScaled, y_train)
Fitting 5 folds for each of 80 candidates, totalling 400 fits
GridSearchCV(cv=5, estimator=SVR(),
param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
3.59381366e+01, 1.00000000e+02]),
'gamma': array([1.00000000e-05, 7.19685673e-05, 5.17947468e-04, 3.72759372e-03,
2.68269580e-02, 1.93069773e-01, 1.38949549e+00, 1.00000000e+01])},
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=SVR(),
param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
3.59381366e+01, 1.00000000e+02]),
'gamma': array([1.00000000e-05, 7.19685673e-05, 5.17947468e-04, 3.72759372e-03,
2.68269580e-02, 1.93069773e-01, 1.38949549e+00, 1.00000000e+01])},
verbose=1)SVR()
SVR()
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
# CV performance over the (C, gamma) grid: rows follow vectorC, columns vectorG
# (cv_results_ orders candidates with the last sorted key, gamma, fastest).
scores = grid.cv_results_['mean_test_score'].reshape(len(vectorC), len(vectorG))
plt.figure(figsize=(10, 6))
# NOTE(review): vmax=0.9 saturates the colour map — the best score is ~0.93;
# widen vmin/vmax if finer contrast is needed near the optimum.
plt.imshow(scores, interpolation='nearest', vmin=0.6, vmax=0.9)
plt.xlabel('log(gamma)')
plt.ylabel('log(C)')
plt.colorbar()
plt.xticks(np.arange(len(vectorG)), np.log10(vectorG), rotation=90)
plt.yticks(np.arange(len(vectorC)), np.log10(vectorC))
# Fixed title: this is R^2 for a regressor, not accuracy.
plt.title('5-fold CV R2')
plt.show()
best mean cross-validation score: 0.927
best parameters: {'C': 100.0, 'gamma': 0.003727593720314938}
# Refit the SVR on the full training set with the optimal hyperparameters.
Copt = grid.best_params_['C']
Gopt = grid.best_params_['gamma']
svmModel = SVR(kernel='rbf', gamma=Gopt, C=Copt).fit(XtrainScaled, y_train)
# Fixed label: SVR.score() returns R^2 for a regressor, not accuracy.
print('R2 (TEST): %0.2f' % svmModel.score(XtestScaled, y_test))
Acc (TEST): 0.96
# SVR predictions on the test set.
y_pred = svmModel.predict(XtestScaled)
# Shared evaluation metrics.
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
# Actual vs predicted; dashed identity line = perfect prediction.
plt.scatter(y_test, y_pred, color='mediumaquamarine', label='SVMs Predictions')
identity = [y_test.min(), y_test.max()]
plt.plot(identity, identity, 'k--', lw=2)
plt.title(f'Valores Reales vs Predicciones')
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.legend()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
MAE: 4.121 MSE: 116.13 RMSE: 10.776 R2: 0.96
# R^2 on train vs test; close values suggest the SVR is not overfitting.
print("Train: ", svmModel.score(XtrainScaled,y_train))
print("Test: ", svmModel.score(XtestScaled,y_test))
Train: 0.9466135230762418 Test: 0.9598823359046609
El análisis del modelo SVM (Máquinas de Vectores de Soporte) revela un rendimiento notablemente alto en la predicción de los datos, con un coeficiente R² de 0.96 en el conjunto de prueba, lo que indica una buena capacidad del modelo para explicar la variabilidad de los datos.
Las métricas de error, como el MAE y el RMSE, muestran que las predicciones del modelo tienen un grado razonable de precisión, aunque el RMSE algo elevado sugiere que existen algunos errores de predicción significativos.
Los valores de entrenamiento y prueba están muy alineados, lo que demuestra que el modelo generaliza bien a nuevos datos y no sufre de sobreajuste. La alta puntuación en la validación cruzada confirma la robustez del modelo, y los parámetros optimizados indican un ajuste adecuado.
from sklearn.kernel_ridge import KernelRidge

# Kernel-ridge regression with an RBF kernel; tune alpha and gamma by CV.
kr = GridSearchCV(
    KernelRidge(kernel="rbf", gamma=0.1),
    param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3], "gamma": np.logspace(-2, 2, 5)},
)
kr.fit(XtrainScaled, y_train)
# Best hyperparameters and CV score.
print("Mejores parámetros:", kr.best_params_)
# Fixed label: no 'scoring' was passed to GridSearchCV, so best_score_ is the
# regressor default (R^2) — the old message wrongly claimed neg_mean_squared_error
# (the printed value ~0.97 is clearly an R^2, not a negative MSE).
print("Mejor puntuación (R2):", kr.best_score_)
kr_best = kr.best_estimator_  # estimator refit on the full training set
# Predictions (GridSearchCV delegates predict to best_estimator_).
y_pred = kr.predict(XtestScaled)
# Shared evaluation metrics.
metrics = compute_metrics(y_test, y_pred)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")
# Actual vs predicted; dashed identity line = perfect prediction.
plt.scatter(y_test, y_pred, color='mediumaquamarine', label='Kernel Ridge Predictions')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.title(f'Valores Reales vs Predicciones')
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.legend()
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
Mejores parámetros: {'alpha': 0.001, 'gamma': 0.01}
Mejor puntuación (neg_mean_squared_error): 0.9747203739235382
MAE: 0.61
MSE: 6.214
RMSE: 2.493
R2: 0.998
# R^2 of the tuned kernel-ridge model on train vs test.
print("Train: ", kr.score(XtrainScaled,y_train))
print("Test: ", kr.score(XtestScaled,y_test))
Train: 0.9990038605388917 Test: 0.9978534759363776
Los resultados obtenidos del modelo Kernel Ridge Regression con los mejores parámetros {'alpha': 0.001, 'gamma': 0.01} demuestran un rendimiento excepcionalmente alto, evidenciado por un R² de 0.998 en el conjunto de prueba. Esto indica que el modelo es capaz de explicar prácticamente toda la variabilidad en los precios de los alojamientos.
La puntuación de validación cruzada de 0.974, junto con valores bajos de MAE y RMSE, refuerza la precisión y la robustez del modelo. Estos resultados sugieren que este, con la configuración óptima de hiperparámetros, ofrece una herramienta poderosa y precisa para la tarea de predicción en este contexto.
lr = LinearRegression().fit(XtrainScaled, y_train)
# Side-by-side comparison of every tuned model.
models = [lr, treeModel, randomForest, baggingModel, bt, svmModel, kr]
model_names = ['Linear Regression', 'Decision Tree', 'Random Forest', 'Bagging', 'Boosting', 'SVM', 'KRidge']
# Fit each model and collect its test-set predictions.
predictions = {}
for model, name in zip(models, model_names):
    model.fit(XtrainScaled, y_train)
    predictions[name] = model.predict(XtestScaled)
ncols = 2  # subplot columns
nrows = (len(models) + ncols - 1) // ncols  # rows needed for all models
fig, axs = plt.subplots(nrows, ncols, figsize=(10, 4 * nrows))
axs = axs.ravel()  # flatten the axes grid for easy 1-D iteration
# One predicted-vs-actual scatter per model, annotated with its metrics.
for idx, (name, y_pred) in enumerate(predictions.items()):
    metrics = compute_metrics(y_test, y_pred)
    metric_text = "\n".join([f"{metric_name}: {metric_value:.2f}" for metric_name, metric_value in metrics.items()])
    axs[idx].scatter(y_test, y_pred, alpha=0.3)
    axs[idx].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
    axs[idx].set_title(f'{name}\n{metric_text}', fontsize='9')
    axs[idx].set_xlabel('Valores Reales', fontsize='9')
    axs[idx].set_ylabel('Predicciones', fontsize='9')
# Fixed: hide the unused trailing axes (7 models on an even grid leaves an
# empty frame that the original drew as a blank plot).
for extra_ax in axs[len(predictions):]:
    extra_ax.set_visible(False)
plt.tight_layout()
plt.show()
from sklearn.model_selection import learning_curve
def plot_learning_curve(model, XtrainScaled, y_train, ax, model_name):
    """Plot the 5-fold CV learning curve (negated MSE) of *model* on axes *ax*."""
    sizes, fit_scores, cv_scores = learning_curve(
        model, XtrainScaled, y_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=5,
        scoring='neg_mean_squared_error'
    )
    # Negate the neg-MSE scores to get errors; keep the spread for the bands.
    fit_mean = -np.mean(fit_scores, axis=1)
    fit_std = np.std(fit_scores, axis=1)
    cv_mean = -np.mean(cv_scores, axis=1)
    cv_std = np.std(cv_scores, axis=1)
    # Mean curves for training and cross-validation.
    ax.plot(sizes, fit_mean, 'o-', color="r", label="Training score")
    ax.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")
    # One-standard-deviation bands around each curve.
    ax.fill_between(sizes, fit_mean - fit_std, fit_mean + fit_std,
                    alpha=0.1, color="r")
    ax.fill_between(sizes, cv_mean - cv_std, cv_mean + cv_std,
                    alpha=0.1, color="g")
    # Labels and legend.
    ax.set_title('Curva de aprendizaje: ' + model_name)
    ax.set_xlabel('Tamaño del conjunto de entrenamiento')
    ax.set_ylabel('MSE Negativo')
    ax.legend(loc="best")
# All tuned models whose learning curves we want to compare.
models = [
    (lr, 'Regresión Lineal'),
    (treeModel, 'Árbol de Decisión'),
    (randomForest, 'Random Forest'),
    (baggingModel, 'Bagging'),
    (bt, 'Boosting'),
    (svmModel, 'SVM'),
    (kr, 'kRidge'),
]
# Fixed: the original hard-coded a 3x2 grid (6 axes) for 7 models, so
# zip() silently dropped the last one (kRidge). Size the grid from the list.
ncols = 2
nrows = (len(models) + ncols - 1) // ncols
fig, axes = plt.subplots(nrows, ncols, figsize=(15, 5 * nrows))
flat_axes = axes.flatten()
# One learning curve per model.
for ax, (model, name) in zip(flat_axes, models):
    plot_learning_curve(model, XtrainScaled, y_train, ax, name)
# Hide any unused trailing axes.
for ax in flat_axes[len(models):]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()
Escrita, no numérica; un par de líneas es más que suficiente